In [1]:
class color_class:
    BOLD_COLOR = '\033[1m' + '\033[93m' +  '\033[94m' +  '\033[95m' +  '\033[91m' +  '\033[92m'
    BOLD = '\033[1m'
    #END = '\033[0m'
    
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    END = '\033[0m'
   # BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

print(color_class.OKBLUE + '\nImporting all the required libraries....\n\n'+ color_class.END)

import warnings 
warnings.filterwarnings("ignore")



# Base libraries
import os
import numpy as np
import pandas as pd
import re
import string
import glob
import math
from IPython.display import display_html
import tqdm
!pip install wandb
import wandb


## visualization libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as patches
import seaborn as sns
!pip install pywaffle
from pywaffle import Waffle


# stat tools
import statsmodels.api as sm
from scipy.stats import kurtosis, skew

## preprocessing & otherlibraries
from sklearn.model_selection import (train_test_split, 
                                     cross_val_score,
                                     StratifiedKFold, 
                                     GridSearchCV)


from sklearn.preprocessing import (StandardScaler,
                                   MinMaxScaler,
                                   RobustScaler)


## data sampling and outlier detection libraries

from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

!pip install umap-learn[plot]
!pip install holoviews
!pip install -U ipykernel
from umap import UMAP
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE,RandomOverSampler



# modeling
from sklearn.linear_model import (LinearRegression, 
                                  LogisticRegression) 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, plot_importance, early_stopping
from sklearn.ensemble import (AdaBoostClassifier,
                              ExtraTreesClassifier,
                              RandomForestClassifier, 
                              GradientBoostingClassifier)


# metrics
from sklearn.metrics import (r2_score, 
                             accuracy_score,
                             roc_auc_score, 
                             f1_score,
                             recall_score, 
                             precision_score, 
                             recall_score,
                             confusion_matrix)

Importing all the required libraries....


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.0-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 54.4 MB/s eta 0:00:00
Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.9/dist-packages (from wandb) (1.4.4)
Requirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (2.27.1)
Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (8.1.3)
Requirement already satisfied: PyYAML in /usr/local/lib/python3.9/dist-packages (from wandb) (6.0)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.20.0-py2.py3-none-any.whl (198 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 198.8/198.8 kB 19.5 MB/s eta 0:00:00
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Requirement already satisfied: protobuf!=4.21.0,<5,>=3.15.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (3.20.3)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... done
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 184.3/184.3 kB 18.2 MB/s eta 0:00:00
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Requirement already satisfied: setuptools in /usr/local/lib/python3.9/dist-packages (from wandb) (67.6.1)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from wandb) (4.5.0)
Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (5.9.5)
Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.9/dist-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.7/62.7 kB 7.9 MB/s eta 0:00:00
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (1.26.15)
Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (2.0.12)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (3.4)
Collecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: pathtools
  Building wheel for pathtools (setup.py) ... done
  Created wheel for pathtools: filename=pathtools-0.1.2-py3-none-any.whl size=8807 sha256=d53ae6f4cd2a77d4a60ec6378c509f2c0c9d068d58a868c49c0fc15cd55b2649
  Stored in directory: /root/.cache/pip/wheels/b7/0a/67/ada2a22079218c75a88361c0782855cc72aebc4d18d0289d05
Successfully built pathtools
Installing collected packages: pathtools, smmap, setproctitle, sentry-sdk, docker-pycreds, gitdb, GitPython, wandb
Successfully installed GitPython-3.1.31 docker-pycreds-0.4.0 gitdb-4.0.10 pathtools-0.1.2 sentry-sdk-1.20.0 setproctitle-1.3.2 smmap-5.0.0 wandb-0.15.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pywaffle
  Downloading pywaffle-1.1.0-py2.py3-none-any.whl (30 kB)
Collecting fontawesomefree
  Downloading fontawesomefree-6.4.0-py3-none-any.whl (25.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25.2/25.2 MB 58.6 MB/s eta 0:00:00
Requirement already satisfied: matplotlib in /usr/local/lib/python3.9/dist-packages (from pywaffle) (3.7.1)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (3.0.9)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (0.11.0)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (8.4.0)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (4.39.3)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (1.4.4)
Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (5.12.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (1.0.7)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (2.8.2)
Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (1.22.4)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (23.1)
Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib->pywaffle) (3.15.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7->matplotlib->pywaffle) (1.16.0)
Installing collected packages: fontawesomefree, pywaffle
Successfully installed fontawesomefree-6.4.0 pywaffle-1.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn[plot]
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 88.2/88.2 kB 6.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.22.4)
Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.2.2)
Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.10.1)
Requirement already satisfied: numba>=0.49 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (0.56.4)
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.9.tar.gz (1.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 53.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (4.65.0)
Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.5.3)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (3.7.1)
Collecting datashader
  Downloading datashader-0.14.4-py2.py3-none-any.whl (18.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.2 MB 75.5 MB/s eta 0:00:00
Requirement already satisfied: bokeh in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (2.4.3)
Requirement already satisfied: holoviews in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.15.4)
Requirement already satisfied: colorcet in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (3.0.1)
Requirement already satisfied: seaborn in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (0.12.2)
Requirement already satisfied: scikit-image in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (0.19.3)
Requirement already satisfied: setuptools in /usr/local/lib/python3.9/dist-packages (from numba>=0.49->umap-learn[plot]) (67.6.1)
Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.9/dist-packages (from numba>=0.49->umap-learn[plot]) (0.39.1)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.9/dist-packages (from pynndescent>=0.5->umap-learn[plot]) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.22->umap-learn[plot]) (3.1.0)
Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (23.1)
Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (6.2)
Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (6.0)
Requirement already satisfied: typing-extensions>=3.10.0 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (4.5.0)
Requirement already satisfied: pillow>=7.1.0 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (8.4.0)
Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (3.1.2)
Requirement already satisfied: pyct>=0.4.4 in /usr/local/lib/python3.9/dist-packages (from colorcet->umap-learn[plot]) (0.5.0)
Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (2.27.1)
Requirement already satisfied: toolz in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (0.12.0)
Requirement already satisfied: param in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (1.13.0)
Requirement already satisfied: xarray in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (2022.12.0)
Collecting datashape
  Downloading datashape-0.5.2.tar.gz (76 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.5/76.5 kB 7.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: dask in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (2022.12.1)
Requirement already satisfied: panel>=0.13.1 in /usr/local/lib/python3.9/dist-packages (from holoviews->umap-learn[plot]) (0.14.4)
Requirement already satisfied: pyviz-comms>=0.7.4 in /usr/local/lib/python3.9/dist-packages (from holoviews->umap-learn[plot]) (2.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas->umap-learn[plot]) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas->umap-learn[plot]) (2022.7.1)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (3.0.9)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (4.39.3)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (1.0.7)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (1.4.4)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (0.11.0)
Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (5.12.0)
Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-image->umap-learn[plot]) (1.4.1)
Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.9/dist-packages (from scikit-image->umap-learn[plot]) (2.25.1)
Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.9/dist-packages (from scikit-image->umap-learn[plot]) (3.1)
Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.9/dist-packages (from scikit-image->umap-learn[plot]) (2023.4.12)
Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib->umap-learn[plot]) (3.15.0)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from Jinja2>=2.9->bokeh->umap-learn[plot]) (2.1.2)
Requirement already satisfied: bleach in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews->umap-learn[plot]) (6.0.0)
Requirement already satisfied: markdown in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews->umap-learn[plot]) (3.4.3)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas->umap-learn[plot]) (1.16.0)
Requirement already satisfied: fsspec>=0.6.0 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (2023.4.0)
Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (8.1.3)
Requirement already satisfied: cloudpickle>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (2.2.1)
Requirement already satisfied: partd>=0.3.10 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (1.4.0)
Requirement already satisfied: multipledispatch>=0.4.7 in /usr/local/lib/python3.9/dist-packages (from datashape->datashader->umap-learn[plot]) (0.6.0)
Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (2.0.12)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (1.26.15)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (3.4)
Requirement already satisfied: locket in /usr/local/lib/python3.9/dist-packages (from partd>=0.3.10->dask->datashader->umap-learn[plot]) (1.0.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.9/dist-packages (from bleach->panel>=0.13.1->holoviews->umap-learn[plot]) (0.5.1)
Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.9/dist-packages (from markdown->panel>=0.13.1->holoviews->umap-learn[plot]) (6.4.1)
Building wheels for collected packages: pynndescent, umap-learn, datashape
  Building wheel for pynndescent (setup.py) ... done
  Created wheel for pynndescent: filename=pynndescent-0.5.9-py3-none-any.whl size=55620 sha256=48de31d05c1c64c9016e25ff61046b510e0bdbf6b89f4437e758d0ec73d3b451
  Stored in directory: /root/.cache/pip/wheels/eb/f2/e3/b8e73d1488d8284d88c9283411561b65bd4f0200abf131a946
  Building wheel for umap-learn (setup.py) ... done
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82830 sha256=118f2bf9d535cd8db9192685403e67da93706bd30ee5c651f96996bc824602c1
  Stored in directory: /root/.cache/pip/wheels/f4/3e/1c/596d0a463d17475af648688443fa4846fef624d1390339e7e9
  Building wheel for datashape (setup.py) ... done
  Created wheel for datashape: filename=datashape-0.5.2-py3-none-any.whl size=59436 sha256=61e9c0317eac03f5ec9f2e7586be577a4ad369947ec69050f65bff6f442d9728
  Stored in directory: /root/.cache/pip/wheels/42/ef/d7/781cf80d4146d76b3d2ed2510113d78c2643c842cc6c22918d
Successfully built pynndescent umap-learn datashape
Installing collected packages: datashape, pynndescent, umap-learn, datashader
Successfully installed datashader-0.14.4 datashape-0.5.2 pynndescent-0.5.9 umap-learn-0.5.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: holoviews in /usr/local/lib/python3.9/dist-packages (1.15.4)
Requirement already satisfied: pyviz-comms>=0.7.4 in /usr/local/lib/python3.9/dist-packages (from holoviews) (2.2.1)
Requirement already satisfied: panel>=0.13.1 in /usr/local/lib/python3.9/dist-packages (from holoviews) (0.14.4)
Requirement already satisfied: param<2.0,>=1.9.3 in /usr/local/lib/python3.9/dist-packages (from holoviews) (1.13.0)
Requirement already satisfied: packaging in /usr/local/lib/python3.9/dist-packages (from holoviews) (23.1)
Requirement already satisfied: pandas>=0.20.0 in /usr/local/lib/python3.9/dist-packages (from holoviews) (1.5.3)
Requirement already satisfied: numpy>=1.0 in /usr/local/lib/python3.9/dist-packages (from holoviews) (1.22.4)
Requirement already satisfied: colorcet in /usr/local/lib/python3.9/dist-packages (from holoviews) (3.0.1)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=0.20.0->holoviews) (2022.7.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=0.20.0->holoviews) (2.8.2)
Requirement already satisfied: markdown in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (3.4.3)
Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (2.27.1)
Requirement already satisfied: bokeh<2.5.0,>=2.4.0 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (2.4.3)
Requirement already satisfied: pyct>=0.4.4 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (0.5.0)
Requirement already satisfied: bleach in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (6.0.0)
Requirement already satisfied: setuptools>=42 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (67.6.1)
Requirement already satisfied: tqdm>=4.48.0 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (4.65.0)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (4.5.0)
Requirement already satisfied: pillow>=7.1.0 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (8.4.0)
Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (6.0)
Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (3.1.2)
Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (6.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas>=0.20.0->holoviews) (1.16.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.9/dist-packages (from bleach->panel>=0.13.1->holoviews) (0.5.1)
Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.9/dist-packages (from markdown->panel>=0.13.1->holoviews) (6.4.1)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (2022.12.7)
Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (2.0.12)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (1.26.15)
Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.9/dist-packages (from importlib-metadata>=4.4->markdown->panel>=0.13.1->holoviews) (3.15.0)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from Jinja2>=2.9->bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (2.1.2)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: ipykernel in /usr/local/lib/python3.9/dist-packages (5.5.6)
Collecting ipykernel
  Downloading ipykernel-6.22.0-py3-none-any.whl (149 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.0/150.0 kB 9.8 MB/s eta 0:00:00
Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (5.3.0)
Requirement already satisfied: packaging in /usr/local/lib/python3.9/dist-packages (from ipykernel) (23.1)
Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.9/dist-packages (from ipykernel) (1.5.6)
Requirement already satisfied: matplotlib-inline>=0.1 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (0.1.6)
Requirement already satisfied: psutil in /usr/local/lib/python3.9/dist-packages (from ipykernel) (5.9.5)
Requirement already satisfied: ipython>=7.23.1 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (7.34.0)
Collecting comm>=0.1.1
  Downloading comm-0.1.3-py3-none-any.whl (6.6 kB)
Requirement already satisfied: debugpy>=1.6.5 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (1.6.6)
Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (6.1.12)
Requirement already satisfied: traitlets>=5.4.0 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (5.7.1)
Requirement already satisfied: tornado>=6.1 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (6.2)
Requirement already satisfied: pyzmq>=20 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (23.2.1)
Requirement already satisfied: backcall in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (0.2.0)
Requirement already satisfied: decorator in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (4.4.2)
Collecting jedi>=0.16
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 71.3 MB/s eta 0:00:00
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (3.0.38)
Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (67.6.1)
Requirement already satisfied: pickleshare in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (0.7.5)
Requirement already satisfied: pygments in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (2.14.0)
Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (4.8.0)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.9/dist-packages (from jupyter-client>=6.1.12->ipykernel) (2.8.2)
Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.9/dist-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel) (3.2.0)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.9/dist-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel) (0.8.3)
Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.9/dist-packages (from pexpect>4.3->ipython>=7.23.1->ipykernel) (0.7.0)
Requirement already satisfied: wcwidth in /usr/local/lib/python3.9/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.23.1->ipykernel) (0.2.6)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.1->jupyter-client>=6.1.12->ipykernel) (1.16.0)
Installing collected packages: jedi, comm, ipykernel
  Attempting uninstall: ipykernel
    Found existing installation: ipykernel 5.5.6
    Uninstalling ipykernel-5.5.6:
      Successfully uninstalled ipykernel-5.5.6
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires ipykernel~=5.5.6, but you have ipykernel 6.22.0 which is incompatible.
Successfully installed comm-0.1.3 ipykernel-6.22.0 jedi-0.18.2
In [2]:
#feature selection and model interpretaiton
!pip install shap
import shap
!pip install eli5
import eli5
from eli5.sklearn import PermutationImportance
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 572.4/572.4 kB 21.3 MB/s eta 0:00:00
Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from shap) (1.22.4)
Requirement already satisfied: numba in /usr/local/lib/python3.9/dist-packages (from shap) (0.56.4)
Requirement already satisfied: cloudpickle in /usr/local/lib/python3.9/dist-packages (from shap) (2.2.1)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.9/dist-packages (from shap) (1.2.2)
Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (from shap) (1.5.3)
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Requirement already satisfied: tqdm>4.25.0 in /usr/local/lib/python3.9/dist-packages (from shap) (4.65.0)
Requirement already satisfied: scipy in /usr/local/lib/python3.9/dist-packages (from shap) (1.10.1)
Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.9/dist-packages (from shap) (23.1)
Requirement already satisfied: setuptools in /usr/local/lib/python3.9/dist-packages (from numba->shap) (67.6.1)
Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.9/dist-packages (from numba->shap) (0.39.1)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas->shap) (2022.7.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas->shap) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn->shap) (3.1.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn->shap) (1.2.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas->shap) (1.16.0)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 216.2/216.2 kB 14.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: attrs>17.1.0 in /usr/local/lib/python3.9/dist-packages (from eli5) (23.1.0)
Requirement already satisfied: jinja2>=3.0.0 in /usr/local/lib/python3.9/dist-packages (from eli5) (3.1.2)
Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.9/dist-packages (from eli5) (1.22.4)
Requirement already satisfied: scipy in /usr/local/lib/python3.9/dist-packages (from eli5) (1.10.1)
Requirement already satisfied: six in /usr/local/lib/python3.9/dist-packages (from eli5) (1.16.0)
Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.9/dist-packages (from eli5) (1.2.2)
Requirement already satisfied: graphviz in /usr/local/lib/python3.9/dist-packages (from eli5) (0.20.1)
Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.9/dist-packages (from eli5) (0.8.10)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2>=3.0.0->eli5) (2.1.2)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20->eli5) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20->eli5) (3.1.0)
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... done
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107747 sha256=a3cccdc9345c994f91535f1229c707a493f5bb685d3b36589f336571ac0bd518
  Stored in directory: /root/.cache/pip/wheels/7b/26/a5/8460416695a992a2966b41caa5338e5e7fcea98c9d032d055c
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0
In [3]:
!pip install vecstack
from vecstack import stacking



## plot settings

sns.set_style('white')
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['axes.spines.left'] = False
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.bottom'] = False
plt.rcParams.update({'font.size':14})
plt.rcParams['font.weight']= 'normal'

    
print(color_class.BOLD + 'Done!!')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vecstack
  Downloading vecstack-0.4.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... done
Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from vecstack) (1.22.4)
Requirement already satisfied: scipy in /usr/local/lib/python3.9/dist-packages (from vecstack) (1.10.1)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.9/dist-packages (from vecstack) (1.2.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.18->vecstack) (3.1.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.18->vecstack) (1.2.0)
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... done
  Created wheel for vecstack: filename=vecstack-0.4.0-py3-none-any.whl size=19879 sha256=23693121a7841649be4d9ba5ab6b9448fae447972c90dd52e5b21d5061ebb160
  Stored in directory: /root/.cache/pip/wheels/7e/ee/d6/47cb94a403bc544de1433986e5530d6b0498021098fbe43aa1
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0
Done!!
In [8]:
import pandas as pd
In [9]:
from google.colab import files
uploaded = files.upload()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving breast-cancer-wisconsin.names to breast-cancer-wisconsin (1).names
In [14]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
colors= ['#e5b6f9' ,'#6fb3a9' ,'#eda760' ,'#c6e699' ,'#fff4af','#000000']

sns.palplot(colors,size = 3)

plt.gcf().set_size_inches(15,5)

plt.text(-0.75,-0.75, 'Women and Cancer: Color Palette',{'fontfamily':'serif', 'size':24, 'weight':'bold'})
plt.text(-0.75,-0.68, 'Lets try to stick to these colors throughout presentation.',{'fontfamily':'serif', 'size':16},alpha = 0.9)
for idx,values in enumerate(colors):
    plt.text(idx-0.25,0, colors[idx],{'fontfamily':'serif', 'size':16, 'weight':'bold','color':'black'}, alpha =0.8)
plt.gcf().set_facecolor('white')
plt.box(None)
plt.axis('off')
plt.text(3.5,0.65,'© Made by Milon',{'fontfamily':'serif', 'size':10,  'color':'black'})
plt.show()
In [ ]:
# Null accuracy Score for current data
NUll_acc = round (max(df.diagnosis.values.mean(), 1 - df.diagnosis.values.mean()), 2)

print(color_class.BOLD +'\nNull Accuracy Score: '+ color_class.END \
      +color_class.OKGREEN  + str(NUll_acc) + color_class.END + '\n' )

print(color_class.OKGREEN  + 'This is the Baseline our model need to cross.\n'+ color_class.END)

Null Accuracy Score: 0.63

This is the Baseline our model need to cross.

In [ ]:
feat_df = df.drop(columns = ['id', 'diagnosis'])
tar_df = df['diagnosis']
cancer_dist = round(tar_df.value_counts(normalize = True),2)*100

fig = plt.figure(FigureClass = Waffle, 
                 constrained_layout = True,
                 figsize = (8,5),
                 facecolor = 'white',dpi = 100,
                 
                 plots = {'111':
                          {     
                           'rows':10,
                           'columns': 10,
                           'values' : [cancer_dist.values[0],cancer_dist.values[1]],
                            'colors' : [colors[1],colors[0]],
                              'vertical' : True,
                              'interval_ratio_y': 0.2,
                              'interval_ratio_x': 0.2,
                              'icon_legend': False,
                              'icon_size':5,
                              'plot_anchor':'C',
                              
                          },
                       
                         })

## labeling 
fig.text(0.36,0.725, '{}%'.format(cancer_dist.values[1]), {'fontfamily':'serif','size':20, 'weight':'bold', 'color':colors[5]})
fig.text(0.625,0.36, '{}%'.format(cancer_dist.values[0]),{'fontfamily':'serif','size':20, 'weight':'bold','color':colors[5]})

## titles and text
#fig.text(-0.1,1.035,'Women and Cancer: How Susceptable Are Women To Breast Cancer?', {'font':'serif','size':18, 'weight':'bold'}, alpha = 1)
#fig.text(-0.1,0.96,'''Its really sad to see nearly 40% of the women are suceptable to cancer.
#Lets hope things will change with medical advancements.''',{'font':'serif','size':12, 'weight':'normal'}, alpha = 0.9)

#fig.text(0.75,0.50, "Cancerous",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[1]})
#fig.text(0.85,0.95, '|',{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[3]})
fig.text(0.85,0.70, "Healthy",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.85,0.50, "Cancerous",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[1]})

fig.text(0.82,0.1,'© Made by Milon',{'fontfamily':'serif', 'size':8,'weight':'bold'}, alpha = 0.7)

fig.show()
In [ ]:
fig,ax = plt.subplots(nrows = 10, ncols = 3, figsize = (12,24),dpi=80)
#fig.patch.set_facecolor(colors[-1])
axes = ax.ravel()

for col,ax in zip(feat_df.columns,axes):
    
    # skewness and kurtosis
    if skew(feat_df[col])>1:
        color = colors[0]
    else:
        color = colors[1]
    
    ## plots
    #sns.kdeplot(feat_df[col], ax= ax, fill = True , color = color, alpha = 1, linewidth = 3, ec = 'black')
    sns.violinplot(feat_df[col], ax =ax, 
                   color = color, cut =0,
                   inner = 'box',
                   alpha = 1,linewidth = 3, edgecolor = 'solidblack', saturation =1 )
    
    ## plot setting
    xlabel = ' '.join([value.capitalize() for value in str(col).split('_') ])
    #ax.set_facecolor(colors[-1])
    ax.axes.get_yaxis().set_visible(False)
    ax.axes.set_xlabel(xlabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1)
   

    
plt.tight_layout(pad= 3,h_pad = 2.5, w_pad = 2.5)


## titles and text
#fig.text(0,1.05,'Women and Cancer: Overview of Univariate Feature Distribution', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(0,1.02,'''Are there any normally distributed features? It seems most of the features
#are skewed and having high kurtosis, may be a log somekind transformation needed. It seems
#most of the se features and fractual Dimensions have outliers.  ''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 1)

fig.text(0.65,1, "Skewed",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]})
fig.text(0.73,1, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
fig.text(0.74,1, "Relative Normal",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[1]})

fig.text(0.73,0,'© Made by bhuvanchennoju/Kaggle',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)

fig.show()
In [ ]:
fig,ax = plt.subplots(nrows = 10, ncols = 3, figsize = (12,24),dpi=80)
#fig.patch.set_facecolor(colors[-1])
axes = ax.ravel()

for col,ax in zip(feat_df.columns,axes):
    
    ## plots
    
    sns.kdeplot(df[col], ax = ax, shade = True ,
                palette = [colors[0], colors[2]],
                alpha = 0.95, linewidth = 3, ec = 'black',
                hue = df['diagnosis'], hue_order = [1,0],
                legend = False)
    
    ## plot setting
    xlabel = ' '.join([value.capitalize() for value in str(col).split('_') ])
    #ax.set_facecolor(colors[-1])
    ax.axes.get_yaxis().set_visible(False)
    ax.axes.set_xlabel(xlabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1)
   

    
plt.tight_layout(pad= 3,h_pad = 1.5, w_pad = 1.5)


## titles and text
#fig.text(0,1.03,'Women and Cancer: Distribution of Cancers cells on Feature level', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(0,1.01,'''It seems most of the features and targets have similar kind of ditribution, but few
#target distributions are morelike normal distribution.''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 1)

fig.text(0.615,1, "Cancerous",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]}, alpha = 1)
fig.text(0.73,1, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
fig.text(0.74,1, "Healthy",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[2]}, alpha = 1)

fig.text(0.73,0,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)

fig.show()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-2346bf32333a> in <cell line: 5>()
      7     ## plots
      8 
----> 9     sns.kdeplot(df[col], ax = ax, shade = True ,
     10                 palette = [colors[0], colors[2]],
     11                 alpha = 0.95, linewidth = 3, ec = 'black',

/usr/local/lib/python3.9/dist-packages/seaborn/distributions.py in kdeplot(data, x, y, hue, weights, palette, hue_order, hue_norm, color, fill, multiple, common_norm, common_grid, cumulative, bw_method, bw_adjust, warn_singular, log_scale, levels, thresh, gridsize, cut, clip, legend, cbar, cbar_ax, cbar_kws, ax, **kwargs)
   1683     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
   1684 
-> 1685     p = _DistributionPlotter(
   1686         data=data,
   1687         variables=_DistributionPlotter.get_semantics(locals()),

/usr/local/lib/python3.9/dist-packages/seaborn/distributions.py in __init__(self, data, variables)
    111     ):
    112 
--> 113         super().__init__(data=data, variables=variables)
    114 
    115     @property

/usr/local/lib/python3.9/dist-packages/seaborn/_oldcore.py in __init__(self, data, variables)
    638         # information for numeric axes would be information about log scales.
    639         self._var_ordered = {"x": False, "y": False}  # alt., used DefaultDict
--> 640         self.assign_variables(data, variables)
    641 
    642         for var, cls in self._semantic_mappings.items():

/usr/local/lib/python3.9/dist-packages/seaborn/_oldcore.py in assign_variables(self, data, variables)
    694         if x is None and y is None:
    695             self.input_format = "wide"
--> 696             plot_data, variables = self._assign_variables_wideform(
    697                 data, **variables,
    698             )

/usr/local/lib/python3.9/dist-packages/seaborn/_oldcore.py in _assign_variables_wideform(self, data, **kwargs)
    743             err = f"The following variable{s} cannot be assigned with wide-form data: "
    744             err += ", ".join(f"`{v}`" for v in assigned)
--> 745             raise ValueError(err)
    746 
    747         # Determine if the data object actually has any data in it

ValueError: The following variable cannot be assigned with wide-form data: `hue`
In [ ]:
print(color_class.BOLD_COLOR + '\nSegregating Features Based On Category....\n' + color_class.END)

### measurement and characteristics keyword lists
measure_keyword = ['radius','perimeter','area','concavity', 'concave points']
character_keyword = ['texture','smoothness','compactness','symmetry','fractal']

### mean, standard error, and worst measure feature lists
mean_measure, mean_character = ['diagnosis'],['diagnosis']
se_measure, se_character = ['diagnosis'],['diagnosis']
worst_measure,worst_character = ['diagnosis'],['diagnosis']

### requrired mean, standard errror,and worst measure feature creating loop
for col in feat_df.columns:
    
    name_list = str(col).split('_')
    
    if name_list[0] in measure_keyword:
        if 'mean' in name_list:
            mean_measure.append(col)
            
        elif 'se' in name_list:
            se_measure.append(col)
            
        else:
            worst_measure.append(col)           
            
    if name_list[0] in character_keyword:
        if 'mean' in name_list:
            mean_character.append(col)
        elif 'se' in name_list:
            se_character.append(col)
        else:
            worst_character.append(col) 
            
###### descriptions and lists            
print(color_class.BOLD + 'Done!' +color_class.END)
print(color_class.BOLD_COLOR + '\nSeperated Features are stored into lists:\n' +color_class.END)
print(color_class.BOLD_COLOR + 'Mean of Measurements: '+color_class.END \
      + color_class.BOLD +str(' , '.join(mean_measure[1:])) + color_class.END +'\n')
print(color_class.BOLD_COLOR + 'Mean of Characteristics: '+color_class.END \
      + color_class.BOLD +str(' , '.join(mean_character[1:])) + color_class.END  +'\n')
print(color_class.BOLD_COLOR + 'Standard Error of Measurements: '+color_class.END \
      + color_class.BOLD +str(' , '.join(se_measure[1:])) + color_class.END  +'\n')
print(color_class.BOLD_COLOR + 'Standard Error of Characteristics: '+color_class.END \
      + color_class.BOLD +str(' , '.join(se_character[1:])) + color_class.END  +'\n')
print(color_class.BOLD_COLOR + 'Worst of Measurements: '+color_class.END \
      + color_class.BOLD +str(' , '.join(worst_measure[1:])) + color_class.END  +'\n')
print(color_class.BOLD_COLOR + 'Worst of Characteristics: '+color_class.END \
      + color_class.BOLD +str(' , '.join(worst_character[1:])) + color_class.END  +'\n')

Segregating Features Based On Category....

Done!

Seperated Features are stored into lists:

Mean of Measurements: radius_mean , perimeter_mean , area_mean , concavity_mean , concave points_mean

Mean of Characteristics: texture_mean , smoothness_mean , compactness_mean , symmetry_mean , fractal_dimension_mean

Standard Error of Measurements: radius_se , perimeter_se , area_se , concavity_se , concave points_se

Standard Error of Characteristics: texture_se , smoothness_se , compactness_se , symmetry_se , fractal_dimension_se

Worst of Measurements: radius_worst , perimeter_worst , area_worst , concavity_worst , concave points_worst

Worst of Characteristics: texture_worst , smoothness_worst , compactness_worst , symmetry_worst , fractal_dimension_worst

In [15]:
print(color_class.BOLD_COLOR +'\nFinally helper function to visualize bivariate features....\n' + color_class.END)
    
### bivariate cross relations visualizations function
            
def cust_pairplot(df,var,title, diag_kind = 'kde',corner = True,sign = 'off'):
    
    ## plot
    g = sns.pairplot(data = df[var],
                 hue= 'diagnosis',hue_order = [1,0],
                 height = 2.5,aspect = 1,
                 corner = True, diag_kind= diag_kind, 
                 palette = [colors[0],colors[2]], 
                 plot_kws = {'alpha':1, 'size' : 1, 'linewidth' : 0.5, 'ec':'black'},
                 diag_kws = {'alpha':0.95,'ec':'black','linewidth':3 });
    
    ### plot setting
    g._legend.remove();
    
    plt.gcf().patch.set_facecolor('white');
    plt.gcf().patch.set_alpha(1)
    plt.gcf().set_size_inches(12,12);
    #plt.gcf().set_dpi(55);
    
    for ax in plt.gcf().axes:
        ax.set_facecolor('white')
        for loc in ['left','right','top','bottom']:
            ax.spines[loc].set_visible(False)
        #ax.set_xticks(ticks = [])
        #ax.set_yticks(ticks = [])
        ax.set_xlabel(xlabel = ax.get_xlabel(), **{'fontfamily':'serif', 'size':12,'weight':'bold'}, alpha = 1)
        ax.set_ylabel(ylabel = ax.get_ylabel(), **{'fontfamily':'serif', 'size':12,'weight':'bold'}, rotation  = 90,alpha = 1)

    ### titles and descriptions

   # plt.gcf().text(0.425,0.85, 'Women and Cancer:\n{}'.format(title),{'fontfamily':'serif', 'size':22.,'weight':'bold'}, alpha = 1)


   # plt.gcf().text(0.425,0.8,'''This visualization shows the bivariate relations among \nthe {}.'''.format(title),{'fontfamily':'serif', 'size':14}, alpha = 1)

    plt.gcf().text(0.44,0.75, "Cancerous",{'fontfamily':'serif','size':18, 'weight':'bold', 'color':colors[0]}, alpha = 1)
    plt.gcf().text(0.565,0.75, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
    plt.gcf().text(0.575,0.75, "Healthy",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[2]}, alpha = 1)

    ## legend
    if sign == 'on':
        plt.gcf().text(0.75,-0.025,'© Made by Milon',{'fontfamily':'serif', 'size':12, 'weight':'bold'},alpha = 1)
    
    plt.gca().margins(x =0)
    plt.gcf().show();

Finally helper function to visualize bivariate features....

In [ ]:
cust_pairplot(df, mean_character, 'Mean Characteristics of Cancer Cells', sign = 'on')
In [ ]:
cust_pairplot(df, se_measure, 'SE in Measurements of Cancer Cells',sign = 'on')
In [ ]:
cust_pairplot(df, se_character, 'SE in Characteristics of Cancer Cells',sign = 'on')
In [ ]:
cust_pairplot(df, se_character, 'SE in Characteristics of Cancer Cells',sign = 'on')
In [ ]:
cust_pairplot(df, worst_character, 'Worst of Characteristics of Cancer Cells', sign = 'on')
In [ ]:
print(color_class.BOLD_COLOR + '\nGetting High Positively Correatated and Negatively Coorealated Features \nfrom cross categorical feature extraction..\n' + color_class.END)

temp_df = df.corr().unstack().reset_index()


### cross relational positive features

positive_corr_df = (temp_df[(temp_df[0]>0.9) &
         (temp_df['level_0'] != temp_df['level_1']) & 
         ((temp_df['level_0'].apply(lambda x: str(x).split('_')[-1])) != (temp_df['level_1'].apply(lambda x: str(x).split('_')[-1])))])

positive_corr_df['z'] = positive_corr_df.apply(lambda x: tuple(sorted([x['level_0'],x['level_1']])), axis = 1)
positive_corr_df.drop_duplicates(subset="z", keep="first" , inplace = True ) 
positive_corr_df.drop(columns = ['z'], inplace = True)



### cross relational negative features

negative_corr_df = (temp_df[(temp_df[0]<-0.2) &
         (temp_df['level_0'] != temp_df['level_1']) & 
         ((temp_df['level_0'].apply(lambda x: str(x).split('_')[-1])) != (temp_df['level_1'].apply(lambda x: str(x).split('_')[-1])))])

negative_corr_df['z'] = negative_corr_df.apply(lambda x: tuple(sorted([x['level_0'],x['level_1']])), axis = 1)
negative_corr_df.drop_duplicates(subset="z", keep="first" , inplace = True ) 
negative_corr_df.drop(columns = ['z'], inplace = True)

print(color_class.BOLD +'Done!')

Getting High Positively Correatated and Negatively Coorealated Features 
from cross categorical feature extraction..

Done!
In [ ]:
print(color_class.BOLD_COLOR + '\nHelper function to visualize the cross categorical Feature analysis\n'+ color_class.END)
def plot_cross_scatter(corr_df, data =df,title = None,des = None,nrows = 4, ncols = 3, figsize = (12,24), colors = colors):
    
    col1_list = corr_df['level_0'].values.tolist()
    col2_list = corr_df['level_1'].values.tolist()
    
    ## plotting
    fig,axes = plt.subplots(nrows,ncols, figsize = (15,20))
    
    # removing the last axes
    axes.ravel()[-1].axes.get_xaxis().set_visible(False)
    axes.ravel()[-1].axes.get_yaxis().set_visible(False)
    
    for ax,col1,col2 in zip(axes.ravel(), col1_list,col2_list):
        
        sns.scatterplot(x= data[col1], y = data[col2], ax = ax,size = 100, 
                        linewidth= 0.5, edgecolor = 'black',
                        hue = data['diagnosis'], hue_order = [1,0],
                        palette = [colors[0],colors[2]], legend = False )
        
        ## plot setting
        xlabel = ' '.join([value.capitalize() for value in str(col1).split('_') ])
        ylabel = ' '.join([value.capitalize() for value in str(col2).split('_') ])
        
        ax.axes.set_xlabel(xlabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1)
        ax.axes.set_ylabel(ylabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1) 
        
        ax.set_xticklabels('')
        ax.set_yticklabels('')
        
    
    ## titles and text
    #fig.text(0.05,0.935,'Women and Cancer: {}'.format(title), {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
    fig.text(0.05,0.91,'''{}'''.format(des),{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 1)

    fig.text(0.63,0.885, "Cancerous",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]}, alpha = 1)
    fig.text(0.735,0.885, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
    fig.text(0.745,0.885, "Healthy",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[2]}, alpha = 1)

    fig.text(0.73,0.1,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)

    fig.show()
    
    return None

Helper function to visualize the cross categorical Feature analysis

In [ ]:
des = 'Here we are seeing the cancer cell features which are highly correlated with each other and belong to different category. \nIt seems we have multi-colinear features and they are passing similar information, and This could alter the predictions.'
plot_cross_scatter(positive_corr_df, title = 'CrossCategorical Positively Related Features', des = des)
In [ ]:
des = 'Here we are seeing the cancer cell features which are moderately correlated with each other and belong to different category. \nIt seems we have multi-colinear features and they are passing similar information, and This could alter the predictions.'
plot_cross_scatter(negative_corr_df,nrows = 4,ncols = 2, figsize=(12,6)
                   ,title = 'CrossCategorical Negitively Correlated Features', des = des)
In [ ]:
temp = df.copy()

X_temp = temp.drop(columns = ['id','diagnosis'])
y_temp = temp['diagnosis']

# fitting on umap
umap = UMAP(random_state=2021)
model_umap = umap.fit_transform(X_temp, y_temp)

fig,ax = plt.subplots(figsize=(7,7),dpi =80)

# plots
ax.scatter(model_umap[temp['diagnosis'] == 0][:,0], model_umap[temp['diagnosis'] == 0][:,1], c= colors[2], alpha=1,s=50, linewidth = 1, ec = 'black')
ax.scatter(model_umap[temp['diagnosis'] == 1][:,0], model_umap[temp['diagnosis'] == 1][:,1], c= colors[0], alpha=1,s=50, linewidth = 1, ec = 'black')


## titles and text

ax.set_xticklabels('')
ax.set_yticklabels('')

fig.text(0,1.01,' Dimensionality Reduction with UMAP', {'fontfamily':'serif','size':18, 'weight':'bold'}, alpha = 1)
#fig.text(0,0.95,'''Wow! As data is very less clear clustering of cancer cells can
#be seen. There are clearly seperable and hope get good results...''',{'font':'serif','size':13, 'weight':'normal'}, alpha = 0.95)

fig.text(0.68,0.85, "Cancerous",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.85,0.85, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.87,0.85, "Healthy",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})

fig.text(0.65,0.05,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)


fig.show()
In [ ]:
print(color_class.BOLD_COLOR + '\nOutlier Removal,Skewness, Kurtosis helper funtions are here....\n'+ color_class.END)

def outlier_detect(algo,data):
    cols = data.drop(columns = ['id']).columns
    # creating feature and target numpy arrays
    feat, tar = data[cols].drop(columns = 'diagnosis').values, data['diagnosis'].values
    # fitting the features to algo
    yhat = algo.fit_predict(feat)
    # masking the features that are not outliers
    mask = yhat != -1
    X,y = feat[mask,:], tar[mask]
    data_inarray = np.append(y.reshape(-1,1),X,axis = 1)
    return pd.DataFrame(data = data_inarray, columns = cols)

def skew_sum(data):
    return skew(data).sum()

def kurtosis_sum(data):
    return kurtosis(data).sum()

def shape(data): 
    return data.shape

Outlier Removal,Skewness, Kurtosis helper funtions are here....

In [ ]:
print(color_class.BOLD_COLOR+'\nOutliers related information with isolation forest, elliptic envelope, localoutlierfactor, dbscan is storing to  dataframes....' + color_class.END)

outlier_algos = [IsolationForest(contamination = 0.05),\
                 EllipticEnvelope(contamination = 0.05),\
                 LocalOutlierFactor(contamination = 0.05), \
                 DBSCAN(eps = 70, min_samples = 10)]


df_list = [df.drop(columns = ['id'])]
shapes = [df.drop(columns = ['id']).shape[0]]
skews = [skew(df.drop(columns = ['id']))]
kurts = [kurtosis(df.drop(columns = ['id']))]

for algo in outlier_algos:
    corrected_df = outlier_detect(algo, df)
    df_list.append(corrected_df)
    shapes.append(corrected_df.shape[0])
    skews.append(skew(corrected_df))
    kurts.append(kurtosis(corrected_df))
        

algorithms = ['Original','IsolationForest', 'EllipticEnvelope', 'LocalOutlierFactor', 'DBSCAN']
outliers_info = pd.DataFrame({'algorithms':algorithms,'df_list':df_list,'shapes':shapes, 'skews':skews, 'kurts': kurts})

outliers_info['skews_sum'] = outliers_info['skews'].apply(lambda x: round(x.sum(),2))
outliers_info['kurts_sum'] = outliers_info['kurts'].apply(lambda x: round(x.sum(),2))
outliers_info.sort_values(by = 'shapes').reset_index(drop = True, inplace = True)


for idx, df_ in enumerate(outliers_info['df_list']):
    from sklearn.metrics import f1_score
    
    lr = LinearRegression()
    X = df_.drop(columns = ['diagnosis'])
    y = df_['diagnosis']
    xtrain, xtest,ytrain,ytest = train_test_split(X,y,test_size = 0.2)
    
    # linear regression
    preds = LinearRegression().fit(xtrain.values,ytrain.values).predict(xtest.values)
        
    r2 = round(r2_score(ytest,preds),3)
    #ypred_class = preds > 0.85
    #acc = round(accuracy_score(ytest, ypred_class),3)
    #roc_auc = round(roc_auc_score(ytest,ypred_class),3)
       
    metric_list = r2
    
    outliers_info.loc[idx, 'r2_score'] = metric_list
    


print(color_class.BOLD + '\nAll the corrected data stored to' + color_class.END \
      + color_class.BOLD_COLOR + str(' Outliers_info DataFrame\n')+ color_class.END)

print(color_class.BOLD)
print(outliers_info.T)

Outliers related information with isolation forest, elliptic envelope, localoutlierfactor, dbscan is storing to  dataframes....

All the corrected data stored to Outliers_info DataFrame


                                                            0  \
algorithms                                           Original   
df_list          diagnosis  radius_mean  texture_mean  per...   
shapes                                                    569   
skews       [0.5270671676029054, 0.9398934455576345, 0.648...   
kurts       [-1.722200200835051, 0.8275836739140465, 0.741...   
skews_sum                                                52.6   
kurts_sum                                              230.35   
r2_score                                                0.761   

                                                            1  \
algorithms                                    IsolationForest   
df_list          diagnosis  radius_mean  texture_mean  per...   
shapes                                                    540   
skews       [0.6375196739103915, 0.6679465471669559, 0.708...   
kurts       [-1.593568665377198, -0.08226351117536046, 0.9...   
skews_sum                                                34.3   
kurts_sum                                               61.37   
r2_score                                                0.749   

                                                            2  \
algorithms                                   EllipticEnvelope   
df_list          diagnosis  radius_mean  texture_mean  per...   
shapes                                                    540   
skews       [0.6461002842069277, 0.7240168357232661, 0.672...   
kurts       [-1.5825544227477208, 0.12872870717619955, 0.8...   
skews_sum                                                37.5   
kurts_sum                                               75.98   
r2_score                                                 0.72   

                                                            3  \
algorithms                                 LocalOutlierFactor   
df_list          diagnosis  radius_mean  texture_mean  per...   
shapes                                                    540   
skews       [0.6461002842069274, 0.6493382869646654, 0.674...   
kurts       [-1.5825544227477213, -0.23318349604508004, 0....   
skews_sum                                               43.57   
kurts_sum                                              143.32   
r2_score                                                0.773   

                                                            4  
algorithms                                             DBSCAN  
df_list          diagnosis  radius_mean  texture_mean  per...  
shapes                                                    443  
skews       [1.492963669774385, 0.4507097379177925, 0.7420...  
kurts       [0.22894051926616887, 0.3695249571409436, 0.62...  
skews_sum                                               48.79  
kurts_sum                                              176.44  
r2_score                                                0.618  
In [ ]:
print(color_class.BOLD_COLOR +'\nHelper class to make the outlier and original data comparisions....\n'+color_class.END)
class outlier_viz():
    
    def __init__(self,ax,orig_feat = None,corrected_feat = None): 
        self.x_org = orig_feat
        self.x_corr = corrected_feat
        self.ax = ax

    def visualize_data(self,name =None, r2 = None,orig_r2 = None):
        
        self.ax.set_facecolor('white')
       # dimension reduction
        pca1 = PCA(n_components= 2).fit_transform(self.x_org)
        pca2 = PCA(n_components= 2).fit_transform(self.x_corr)
        
        self.ax.scatter(pca1[:,0], pca1[:,1], c = colors[0], s = 50, zorder =0, alpha = 1, linewidth = 1, ec = 'black')
        self.ax.scatter(pca2[:,0], pca2[:,1], c = colors[1], s = 50, zorder = 3, alpha = 1,linewidth = 1, ec = 'black')
   
        self.ax.text(3000,900,'{}'.format(name), {'fontfamily':'serif','size':14,'weight':'bold','color':'black'},alpha= 0.9)
        self.ax.text(3000,800,'R2 Score: {}'.format(r2), {'fontfamily':'serif','size':14,'weight':'bold','color':'black'},alpha= 0.9)
        self.ax.text(3000,700,'Orig R2 Score: {}'.format(orig_r2), {'fontfamily':'serif','size':14,'weight':'bold','color':'black'},alpha= 0.9)

Helper class to make the outlier and original data comparisions....

In [ ]:
fig, ax =plt.subplots(2,2,figsize =(13,9), dpi = 70)
axes = ax.ravel()
for ax in axes:
    ax.set_xticklabels('')
    ax.set_yticklabels('')

# plotting 
orig = outliers_info['df_list'][0]

(outlier_viz(ax = axes[0] , orig_feat = orig, corrected_feat= outliers_info['df_list'][1])
            .visualize_data(name = 'Isolation Forest', r2= outliers_info['r2_score'][1],orig_r2 = outliers_info['r2_score'][0]))

(outlier_viz(ax = axes[1], orig_feat = orig, corrected_feat= outliers_info['df_list'][2])
 .visualize_data(name = 'Eclliptic Envelope',r2= outliers_info['r2_score'][2],orig_r2 = outliers_info['r2_score'][0]))

(outlier_viz(ax = axes[2], orig_feat = orig, corrected_feat= outliers_info['df_list'][3])
 .visualize_data(name = 'Local Outlier Factor',r2= outliers_info['r2_score'][3],orig_r2 = outliers_info['r2_score'][0]))

(outlier_viz(ax = axes[3], orig_feat = orig, corrected_feat= outliers_info['df_list'][4])
 .visualize_data(name = 'DBSCAN',r2= outliers_info['r2_score'][4],orig_r2 = outliers_info['r2_score'][0]))


# text and labels
### title and annotations
## titles and text
#fig.text(-0.05,1.085,'Women and Cancer: Outliers and Original Data', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(-0.05,1.0,'''Looks like evey outlier detection algorithm did a good job, butit is not possible to 
#select the best one out of them, without looking at skew and kurtosis values, 
#lets dive into that next...''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 1)

#0.59
#0.73
#0.74
fig.text(0.29,1, "Original Data",{'fontfamily':'serif','size':22, 'weight':'bold', 'color':colors[0]}, alpha = 1)
fig.text(0.49,1, '|',{'fontfamily':'serif','size':22, 'weight':'bold'})
fig.text(0.54,1, "Corrected Data",{'fontfamily':'serif','size':22, 'weight':'bold','color':colors[1]}, alpha = 1)


fig.text(0.7,-0.01,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)

fig.tight_layout(pad = 1.5, w_pad = 1.5,h_pad = 1.5)
fig.show()
In [ ]:
fig,ax = plt.subplots(1,2, figsize =(12,6), dpi = 100)
axes = ax.ravel()

# total skew plot
for idx in range(1,outliers_info.shape[0]):
    axes[0].barh(width = outliers_info['skews_sum'][0],
                 y = outliers_info['algorithms'][idx], color = colors[0])
axes[0].barh(width = outliers_info['skews_sum'][1:],
             y = outliers_info['algorithms'][1:], color = colors[1])

# total kurtosis plot
for idx in range(1,outliers_info.shape[0]):
    axes[1].barh(width = outliers_info['kurts_sum'][0],
                 y = outliers_info['algorithms'][idx], color = colors[0])
axes[1].barh(width = outliers_info['kurts_sum'][1:],
             y = outliers_info['algorithms'][1:], color = colors[2])

# plot ticks and title setting
axes[1].tick_params(axis = 'y',pad = 95)
axes[1].set_yticklabels(outliers_info['algorithms'][1:], {'fontfamily':'serif','size':16,'weight':'bold'}, ha = 'center')
axes[1].set_xticklabels('')
axes[0].set_xticklabels('')
axes[0].set_yticklabels('')
axes[0].invert_xaxis()


## text and decorations

# skewness annotations
for pa in axes[0].patches:
    axes[0].text(pa.get_width(), pa.get_y()+ pa.get_height()/2, int(pa.get_width()),
                 {'color':'black','fontfamily':'serif','weight':'bold','size':'12'},alpha= 1, va = 'center')
    if pa in axes[0].patches[4:]:
        
        change = int((int(outliers_info['skews_sum'][0]) - int(pa.get_width())) / int(outliers_info['skews_sum'][0]) *100)
        
        axes[0].text(pa.get_width()*0, pa.get_y(),'{}% {}'.format(change,u'\u2193'),
                     {'color':'black','fontfamily':'serif','weight':'bold','size':14},
                     alpha= 0.8,va = 'bottom', ha='right')
      

        
# kurtosis annotations
for pa in axes[1].patches:
    axes[1].text(pa.get_width()-20, pa.get_y()+ pa.get_height()/2, int(pa.get_width()),
                 {'color':'black','fontfamily':'serif','weight':'bold','size':'12'},alpha= 1, va = 'center')
    if pa in axes[1].patches[4:]:
        
        change = int((int(outliers_info['kurts_sum'][0]) - int(pa.get_width())) / int(outliers_info['kurts_sum'][0]) *100)
        
        axes[1].text(pa.get_width()*0, pa.get_y(),'{}% {}'.format(change,u'\u2193'),
                     {'color':'black','fontfamily':'serif','weight':'bold','size':14},alpha= 0.8, va = 'bottom')

        
### title and annotations
## titles and text
#fig.text(-0.05,1.18,' Comparision of Total Skews and Kurtosis', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(-0.05,1.07,'''Total Skews and Total kurtosis means sum of skews,and sum of kurtosis of all features
#respectively. It seems with default setting of 5% points as outliers, Isolation forest
#did well in reducing both skew and kurtosis of data.''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 0.95)

fig.text(0.27,0.99, "Skewness",{'fontfamily':'serif','size':18, 'weight':'bold', 'color':colors[1]})
fig.text(0.40,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.45,0.99, "Original",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[0]})
fig.text(0.60,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.62,0.99, "Kurtosis",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[2]})

fig.text(0.82,0.0,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)


plt.tight_layout(pad = 1, h_pad = 1, w_pad = 1)

fig.show()
In [ ]:
fig, ax = plt.subplots(1,2,figsize = (12,12))

axes = ax.ravel()

axes[0].invert_xaxis()
axes[0].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['skews'][0].tolist(), color = colors[0],align='center')
axes[0].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['skews'][1].tolist(), color = colors[1],align='center')

axes[1].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['kurts'][0].tolist(), color = colors[0],align='center')
axes[1].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['kurts'][1].tolist(), color = colors[2],align='center')


axes[0].set_yticklabels('')
axes[1].set_yticklabels(df.drop(columns = ['id']).columns, {'fontfamily':'serif','size':12,'weight':'bold'},rotation = 0,ha= 'center')
axes[1].tick_params(axis = 'y',pad = 75)
axes[0].set_xticklabels('')
axes[1].set_xticklabels('')

### title and annaotations
### title and annotations
## titles and text
#fig.text(0,1.09,' Isolation Forest Feature Level Stats ', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(0,1.05,'''We go to know that Isolation forest does a good job with outlier detection with 10% contamination
#and this butterfly plot shows the feature level change in skewness and kurosis values.''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 0.9)

fig.text(0.27,0.99, "Skewness",{'fontfamily':'serif','size':18, 'weight':'bold', 'color':colors[1]})
fig.text(0.40,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.45,0.99, "Original",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[0]})
fig.text(0.60,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.62,0.99, "Kurtosis",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[2]})

fig.text(0.75,0.05,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)

plt.tight_layout(pad = 1, h_pad = 1, w_pad = 1)
fig.show()
In [ ]:
print(color_class.BOLD_COLOR + '\nSelecting Data without Outliers with Isolation Forest with contamination of 10%\n' + color_class.END)
# selecting df with outliers removed
df = outliers_info['df_list'][1]
print(color_class.BOLD + 'Done!\n')

print('Shape of Original Data: '+\
      color_class.BOLD_COLOR+ str(outliers_info['df_list'][0].shape) + color_class.END)
print(color_class.BOLD + 'Shape of Corrected Data: ' + color_class.END+\
      color_class.BOLD_COLOR+ str(df.shape) + color_class.END)
print(color_class.BOLD_COLOR+ '\nAll set for feature selection...\n' + color_class.END)

Selecting Data without Outliers with Isolation Forest with contamination of 10%

Done!

Shape of Original Data: (569, 31)
Shape of Corrected Data: (540, 31)

All set for feature selection...

In [ ]:
print(color_class.BOLD_COLOR+'\nCutom Correlation matrix values extraction...\n'+color_class.END)
## correlation matrix customization 

corr_df = df.corr()
temp_df = corr_df.stack().reset_index()
temp_df = temp_df[temp_df[0] != 1.0].reset_index(drop = True)
temp_df['z'] = temp_df.apply(lambda x: tuple(sorted([x['level_0'],x['level_1']])), axis = 1)
temp_df.drop_duplicates(subset="z", keep="first" , inplace = True ) 
temp_df.drop(columns = ['z'], inplace = True)
temp_df.reset_index(drop = True,inplace = True)
temp_df['color'] = temp_df[0].apply(lambda x: colors[1] if x <0.25 else ( colors[2]  if ((x > 0.25) & (x<0.85)) else colors[0]))

print(color_class.BOLD +'Correation Matrix data ready for custom visualization...\n')
print(color_class.BOLD)
print(temp_df.head(2))

Cutom Correlation matrix values extraction...

Correation Matrix data ready for custom visualization...


     level_0       level_1         0    color
0  diagnosis   radius_mean  0.735593  #eda760
1  diagnosis  texture_mean  0.394614  #eda760
In [ ]:
## custom heatmap for correlation matrix

fig, ax = plt.subplots(figsize = (10,6), dpi = 85)

# flipping yaxis 
ax.invert_yaxis()

## Creating dop representational plot
ax.scatter(x = temp_df['level_0'], y = temp_df['level_1'],
           s = temp_df[0]*100, c = temp_df['color'], linewidth = 1, edgecolor = 'black')

## plot setting - ticks and labels

x_vals = temp_df['level_0'].value_counts()
y_vals = temp_df['level_1'].value_counts().sort_values()

xticklabels =  [ ' '.join((str(col).capitalize()).split('_')) for col in  x_vals.index]
yticklabels = [ ' '.join((str(col).capitalize()).split('_')) for col in  y_vals.index]

#xticklabels.reverse()

#for x,y,label in zip(x_vals.values, x_vals.values,xticklabels):
#    ax.text(y-1,x-1.5,label,{'font':'serif','size':10,'weight':'bold','color':'black'},rotation = 90, ha = 'center',va = 'bottom', alpha = 0.75)

ax.set_yticklabels(yticklabels,  {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75)    
ax.set_xticklabels(xticklabels,  {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75,rotation = 90)  


## titles and desc

## titles and text
fig.text(-0.05,0.98,' Correlation Matrix and Multi-colinearity', {'fontfamily':'serif','size':20, 'weight':'bold'}, alpha = 1)
fig.text(-0.05,0.92,'''Features could be highly correlated, moderately correalated, and least correlated based on color scheme.
Blanks spaces indicate negative correlations. Multicolinearity exits in data.''',{'fontfamily':'serif','size':12, 'weight':'normal'}, alpha = 0.9)

fig.text(0.37,0.75, "High",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.45,0.75, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.48,0.75, "Moderate",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.62,0.75, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.65,0.75, "Least",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[1]})

fig.text(0.65,-0.2,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.7)


fig.show()
In [ ]:
print(color_class.BOLD_COLOR  +'\nVariance inflation factor algorithm in a nutshell...\n'+color_class.END)
def VIF(data):
    vif_list = list()
    for col in data.columns:
        X = data.drop(columns = [col])
        y = data[col]
        model = LinearRegression().fit(X.values,y.values)
        ypreds = model.predict(X.values)
        r2 = r2_score(y.values,ypreds)
        VIF = 1 /(1-r2)
        vif_list.append(VIF)
    return vif_list

Variance inflation factor algorithm in a nutshell...

In [ ]:
print(color_class.BOLD_COLOR + '\nVariance Inflation factor implementation for feature selection.... \n'+ color_class.END)
VIF_max = 1000
VIF_dfs = {}
n = 0
while int(VIF_max) > 10:   
    try:
        data = data.drop(columns = [temp['Features'][0]])
        temp = (pd.DataFrame({'Features':data.columns,'VIF':VIF(data)})
        .sort_values(by = 'VIF', ascending = False).reset_index(drop = True))
    except:
        data = df.drop(columns = ['diagnosis'])
        temp = (pd.DataFrame({'Features':df.drop(columns = ['diagnosis']).columns,'VIF':VIF(data)})
        .sort_values(by = 'VIF', ascending = False).reset_index(drop = True))
    
    VIF_max = temp['VIF'][0]
    VIF_dfs['iter_{}'.format(n)] = temp
    n+=1
del temp

print(color_class.BOLD + '\nCalcuations are finished! feature and corresponding VIF are stored in VIF_dfs list\n'+color_class.END)
print(color_class.BOLD_COLOR + 'Final Features In Data and Final VIFs...'+ color_class.END)
print(color_class.BOLD)
print(VIF_dfs['iter_{}'.format(n-1)])

Variance Inflation factor implementation for feature selection.... 


Calcuations are finished! feature and corresponding VIF are stored in VIF_dfs list

Final Features In Data and Final VIFs...

                   Features       VIF
0          smoothness_worst  9.885053
1   fractal_dimension_worst  9.021375
2    fractal_dimension_mean  7.652832
3            compactness_se  7.396592
4            symmetry_worst  6.632449
5           smoothness_mean  6.414340
6      fractal_dimension_se  6.394734
7              concavity_se  5.143290
8             smoothness_se  4.324478
9         concave points_se  4.114929
10                area_mean  4.072539
11              symmetry_se  3.930165
12            symmetry_mean  3.351743
13                radius_se  2.867922
14               texture_se  2.011996
15             texture_mean  1.688786
In [ ]:
print(color_class.BOLD_COLOR + '\nMerging VIF iteration history dataframes to understand how algoirthm worked....\n'+ color_class.END)

### merging all the dataframes from VIF feature selection implementation
for key,value in VIF_dfs.items():
    if key == 'iter_0':
        base = value
    else:
        base = pd.merge(left = base, right = value, on = 'Features', how = 'outer')
    base = base
VIF_matrix = base.fillna(0).set_index('Features',drop = True)
del base 
VIF_matrix.columns = VIF_dfs.keys()

## write up
print(color_class.BOLD + '\nDataframes merged and stored data into' + color_class.END + color_class.BOLD_COLOR+ ' VIF_matrix' + color_class.END)
print(color_class.BOLD_COLOR+ '\nExtracting data for custom visualization....\n' + color_class.END)


### Extracting data for custom visualization
temp_df = VIF_matrix.apply(lambda x: x/x.max(), axis = 0).stack().reset_index() # noralizing values and stacking

## there wont be any duplicates, just to make sure do this drop
temp_df['z'] = temp_df.apply(lambda x: tuple(sorted([x['Features'],x['level_1']])), axis = 1)
temp_df.drop_duplicates(subset="z", keep="first" , inplace = True ) 
temp_df.drop(columns = ['z'], inplace = True)
temp_df.reset_index(drop = True,inplace = True)
temp_df['color'] = temp_df[0].apply(lambda x: colors[2] if x <1  else colors[0])


## write up
print(color_class.BOLD + '\nDone!!' + color_class.END)

Merging VIF iteration history dataframes to understand how algoirthm worked....


Dataframes merged and stored data into VIF_matrix

Extracting data for custom visualization....


Done!!
In [ ]:
## custom heatmap for correlation matrix

fig, ax = plt.subplots(figsize = (10,6), dpi = 85)

# flipping yaxis 
ax.invert_yaxis()

## Creating dop representational plot
ax.scatter(y = temp_df['Features'], x = temp_df['level_1'],
           s = temp_df[0]*120, c = temp_df['color'], linewidth = 1, edgecolor = 'black')


ax.axvspan(xmin = 13.5, xmax = 14.5, color = colors[1], alpha = 0.25,zorder = 0)

## plot setting - ticks and labels

y_vals = temp_df['Features'].unique()
x_vals = temp_df['level_1'].unique()

xticklabels =  [ ' '.join((str(col).capitalize()).split('_')) for col in  x_vals]
yticklabels = [ ' '.join((str(col).capitalize()).split('_')) for col in  y_vals]

#xticklabels.reverse()

#for x,y,label in zip(x_vals.values, x_vals.values,xticklabels):
#    ax.text(y-1,x-1.5,label,{'font':'serif','size':10,'weight':'bold','color':'black'},rotation = 90, ha = 'center',va = 'bottom', alpha = 0.75)

ax.set_yticklabels(yticklabels,  {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75)    
ax.set_xticklabels(xticklabels,  {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75,rotation = 90)  

ax.annotate('This Feature drops \nin next iteration', xy=(0.,0), xytext=(2, -1),
             arrowprops=dict(facecolor='white',arrowstyle="->",
                             connectionstyle="arc3,rad=.5",color='black',linewidth=0.8, alpha = 0.7), 
             #bbox = dict(boxstyle ="round", fc ="white", pad =0.25,color = 'darkorange'),
            fontsize=8,fontfamily='serif',fontweight ='bold',ha='center', color='black', zorder = 3,
            annotation_clip = False, alpha = 0.85)
ax.annotate('Final Features wrt VIF Feature Removal', xy=(14.75,25), xytext=(14.75,25),
            fontsize=8,fontfamily='serif',fontweight ='bold',ha='center', color='black', zorder = 3,
            annotation_clip = False, alpha = 0.85, rotation = 90)


## titles and desc

## titles and text
fig.text(-0.05,0.98,' Normalized VIF of Features with Iterations', {'fontfamily':'serif','size':20, 'weight':'bold'}, alpha = 1)
fig.text(-0.05,0.92,'''VIF are calculated for each feature, and removed the highest VIF feature (Purple) for next iteration.
Clearly all the highly correlated features are removed''',{'fontfamily':'serif','size':12, 'weight':'normal'}, alpha = 0.9)

fig.text(0.5,0.90, "Highest VIF",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.65,0.90, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.68,0.90, "Moderate VIF",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})


fig.text(0.65,-0.05,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.7)


fig.show()
In [ ]:
X_temp = df.drop(columns = ['diagnosis'])
y_temp = df['diagnosis']
temp_X_train,temp_X_val,temp_y_train,temp_y_val = train_test_split(X_temp,y_temp, test_size = 0.2, random_state = 2021)

temp_model= XGBClassifier(eval_metric='logloss').fit(temp_X_train,temp_y_train)

perm = PermutationImportance(temp_model, scoring = 'r2').fit(temp_X_val,temp_y_val)
eli5_feature_importance1 = (pd.DataFrame({'Features':temp_X_train.columns.tolist(),'Importance':perm.feature_importances_})
                           .sort_values(by = 'Importance'))
perm_imp_feats1 = (eli5_feature_importance1.sort_values(by = 'Importance', ascending = False)
                         .reset_index(drop = True))['Features'][0:15]

print(color_class.BOLD_COLOR + 'Feature Importance with r2 metric same as VIF....')
eli5.show_weights(perm, feature_names = temp_X_val.columns.tolist())
Feature Importance with r2 metric same as VIF....
Out[ ]:
Weight Feature
0.0930 ± 0.0986 area_worst
0.0761 ± 0.0986 texture_worst
0.0507 ± 0.0633 texture_mean
0.0338 ± 0.1353 concave points_worst
0.0338 ± 0.0338 radius_worst
0.0169 ± 0.0414 area_se
0.0169 ± 0.0676 fractal_dimension_se
0 ± 0.0000 area_mean
0 ± 0.0000 compactness_mean
0 ± 0.0000 symmetry_mean
0 ± 0.0000 fractal_dimension_mean
0 ± 0.0000 smoothness_se
0 ± 0.0000 radius_se
0 ± 0.0000 texture_se
0 ± 0.0000 perimeter_se
0 ± 0.0000 radius_mean
0 ± 0.0000 concavity_se
0 ± 0.0000 symmetry_worst
0 ± 0.0000 perimeter_mean
0 ± 0.0000 fractal_dimension_worst
… 10 more …
In [ ]:
print(color_class.BOLD_COLOR + '\nFitting train data on linear regression to get accuracy,r2 ,and roc_aoc scores...\n' + color_class.END)
vif_features = VIF_dfs['iter_14']['Features'].values.tolist()
def test_linear_features():
        X_orig = df.drop(columns = ['diagnosis'])
        y_orig = df['diagnosis']
        X_train,X_val,y_train,y_val = train_test_split(X_orig,y_orig, test_size = 0.2, random_state = 2021)
        
        # linear regression
        orig_preds = LinearRegression().fit(X_train.values,y_train.values).predict(X_val.values)
        vif_preds = LinearRegression().fit(X_train[vif_features].values,y_train.values).predict(X_val[vif_features].values)
        perm_preds = LinearRegression().fit(X_train[perm_imp_feats1].values,y_train.values).predict(X_val[perm_imp_feats1].values)
        
        
        orig_r2 = round(r2_score(y_val, orig_preds),3)
        vif_r2 = round(r2_score(y_val, vif_preds),3)
        perm_r2 = round(r2_score(y_val, perm_preds),3)
        
        from sklearn.metrics import f1_score

        orig_ypred_class = orig_preds > 0.85
        vif_ypred_class = vif_preds > 0.85
        perm_ypred_class = perm_preds > 0.85
        
        orig_auc = round(accuracy_score(y_val, orig_ypred_class),3)
        vif_auc = round(accuracy_score(y_val, vif_ypred_class),3)
        perm_auc = round(accuracy_score(y_val, perm_ypred_class),3)
        
        orig_roc_auc = round(roc_auc_score(y_val, orig_preds),3)
        vif_roc_auc = round(roc_auc_score(y_val, vif_preds),3)
        perm_roc_auc = round(roc_auc_score(y_val, perm_preds),3)
       
    
        orig_list = [orig_auc,orig_r2,orig_roc_auc]
        vif_list = [vif_auc, vif_r2,vif_roc_auc]
        perm_list = [perm_auc,perm_r2,perm_roc_auc]
        return orig_list,vif_list,perm_list
        
orig_list,vif_list, perm_list = test_linear_features()
print(color_class.BOLD + '\nAccuracy score, r2 score, and roc_auc score:\n ' + color_class.END)
print(color_class.BOLD + 'Orginal Features: ' + color_class.END + color_class.BOLD_COLOR + str(orig_list) + color_class.END + '\n')
print(color_class.BOLD + 'Permutation Feature Selection: ' + color_class.END + color_class.BOLD_COLOR + str(vif_list) + color_class.END + '\n')
print(color_class.BOLD + 'Variance Infaltion Factor based Feature Selection: ' + color_class.END + color_class.BOLD_COLOR + str(perm_list) + color_class.END + '\n')

Fitting train data on linear regression to get accuracy,r2 ,and roc_aoc scores...


Accuracy score, r2 score, and roc_auc score:
 
Orginal Features: [0.852, 0.714, 0.972]

Permutation Feature Selection: [0.843, 0.717, 0.987]

Variance Infaltion Factor based Feature Selection: [0.806, 0.703, 0.986]

In [ ]:
print(color_class.BOLD_COLOR +'\nHelper function for the PCA visualization with diagnosis hue... \n'+ color_class.END)

class pca_viz():
    
    def __init__(self,feat,tar,ax): 
        self.feat = feat
        self.tar = tar
        self.ax = ax

    def visualize_data(self):
        
        temp_y = pd.DataFrame({'y':self.tar})
        
        pca = PCA(n_components= 2).fit_transform(self.feat)
        # plotting
        self.ax.scatter(pca[temp_y['y'] == 0][:,0], pca[temp_y['y'] == 0][:,1], c = colors[2], s = 50, linewidth =1, ec = 'black')
        self.ax.scatter(pca[temp_y['y'] == 1][:,0], pca[temp_y['y'] == 1][:,1], c = colors[0], s = 50, linewidth =1,ec ='black')
   
        #self.ax.set_xticklabels('')
        #self.ax.set_yticklabels('')

Helper function for the PCA visualization with diagnosis hue... 

In [ ]:
fig = plt.figure(figsize =(14,14))
gs = fig.add_gridspec(10,10)
gs.update(wspace = 10,hspace = 2)

#ax0 = fig.add_subplot(gs[:,:])
ax1 = fig.add_subplot(gs[1:5, 0:5])
ax2 = fig.add_subplot(gs[1:5, 5:10])
ax3 = fig.add_subplot(gs[6:10, 2:8])


axes = [ax1,ax2,ax3]

data_ = df.copy()

## data with outlier removal 
X_orig = data_.drop(columns = ['diagnosis']).values
X_perm = data_.drop(columns = ['diagnosis'])[perm_imp_feats1].values
y_orig = data_['diagnosis'].values

## data with VIF featue selection

X_vif = data_[vif_features].values

# plots
pca_viz(feat = X_perm,tar = y_orig,ax=axes[0]).visualize_data()
pca_viz(feat = X_vif, tar = y_orig,ax=axes[1]).visualize_data()
pca_viz(feat = X_orig,tar = y_orig,ax=axes[2]).visualize_data()


# text and title
## titles and text
axes[0].text(-50,570, 'Permutation Feature Selection',{'fontfamily':'serif','size':14, 'weight':'bold'}, zorder =3)
axes[1].text(-100,22.5, 'VIF Feature Selection',{'fontfamily':'serif','size':14, 'weight':'bold'})
axes[2].text(0,675, 'Original Features',{'fontfamily':'serif','size':14, 'weight':'bold'})

## add scores

#permutation annotations
axes[0].annotate('Acc: {}'.format(perm_list[0]),(1600,440),(1600,440), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[0].annotate(' R2: {}'.format(perm_list[1]),(1600,400),(1600,400), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[0].annotate('AUC: {}'.format(perm_list[2]),(1600,350),(1600,350), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)

#VIF annotations
axes[1].annotate('Acc: {}'.format(vif_list[0]),(1000,18),(1000,18), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[1].annotate(' R2: {}'.format(vif_list[1]),(1000,16.5),(1000,16.5), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[1].annotate('AUC: {}'.format(vif_list[2]),(1000,15),(1000,15), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)

# Original annotations
axes[2].annotate('Acc: {}'.format(orig_list[0]),(1900,510),(1900,510), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[2].annotate(' R2: {}'.format(orig_list[1]),(1900,470),(1900,470), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[2].annotate('AUC: {}'.format(orig_list[2]),(1900,425),(1900,425), zorder =3, annotation_clip = False,
                 fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)


# text and titles

fig.text(0.05,0.95,' Linear model Perforamace with PFI,VIF,and Original Data', {'fontfamily':'serif','size':20, 'weight':'bold'}, alpha = 1)
fig.text(0.05,0.9,'''Though the feature selection is done based on r2 metric, for comparision 
of accuracy,r2, and auc scores among PFI, VIF and Orginal data with LinearRegression.
VIF based feature selection should give edge here...  ''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.9)

fig.text(0.40,0.88, "Cancerous",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]})
fig.text(0.52,0.88, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
fig.text(0.55,0.88, "Healthy",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[2]})

fig.text(0.75,0.075,'© Made by Milon',{'fontfamily':'serif', 'size':9,'weight':'bold'}, alpha = 0.8)

fig.show()
In [ ]:
X_temp = df.drop(columns = ['diagnosis'])
y_temp = df['diagnosis']
temp_X_train,temp_X_val,temp_y_train,temp_y_val = train_test_split(X_temp,y_temp, test_size = 0.2, random_state = 2021)

temp_model= XGBClassifier(eval_metric='logloss').fit(temp_X_train,temp_y_train)

perm = PermutationImportance(temp_model, scoring = 'roc_auc').fit(temp_X_val,temp_y_val)
eli5_feature_importance2 = (pd.DataFrame({'Features':temp_X_train.columns.tolist(),'Importance':perm.feature_importances_})
                           .sort_values(by = 'Importance'))
perm_imp_feats_auc = (eli5_feature_importance2.sort_values(by = 'Importance', ascending = False)
                         .reset_index(drop = True))['Features']

print(color_class.BOLD_COLOR + 'Feature Importance with roc_auc metric....')
eli5.show_weights(perm, feature_names = temp_X_val.columns.tolist())
Feature Importance with roc_auc metric....
Out[ ]:
Weight Feature
0.0298 ± 0.0369 texture_worst
0.0258 ± 0.0135 area_worst
0.0122 ± 0.0064 area_se
0.0091 ± 0.0137 concave points_worst
0.0081 ± 0.0042 compactness_se
0.0046 ± 0.0054 concavity_worst
0.0032 ± 0.0057 symmetry_se
0.0030 ± 0.0046 perimeter_worst
0.0027 ± 0.0060 concave points_mean
0.0024 ± 0.0019 fractal_dimension_se
0.0020 ± 0.0030 concavity_se
0.0020 ± 0.0107 radius_worst
0.0016 ± 0.0025 symmetry_worst
0.0009 ± 0.0012 texture_mean
0.0007 ± 0.0028 radius_mean
0.0006 ± 0.0006 area_mean
0.0005 ± 0.0018 symmetry_mean
0.0003 ± 0.0003 perimeter_se
0.0002 ± 0.0026 smoothness_worst
0.0002 ± 0.0008 fractal_dimension_worst
… 10 more …
In [ ]:
## dataframe as per feature selection from permutation importance
temp_X_df = df.drop(columns ='diagnosis').copy()
temp_X_df = temp_X_df[perm_imp_feats_auc]
temp_y_df = df['diagnosis']

## crossvalidation with repeated feature selection
stratified = StratifiedKFold(n_splits = 5,shuffle = True, random_state = 2021)
 
feat_acc = []
feat_auc = []
feat_f1 = []
for idx,feat in enumerate (perm_imp_feats_auc):
    temp = temp_X_df.iloc[:,:idx]
    temp['all_other'] = temp_X_df.iloc[:,idx:len(perm_imp_feats_auc)].sum(axis = 1)
    X_ = temp
    y_ = temp_y_df
    
    if idx == 0:
        continue
    else:
        fold_acc = []
        fold_auc = []
        fold_f1 = []
        for train_idx,valid_idx in stratified.split(X_,y_):
    
            xtrain,xvalid = X_.iloc[train_idx],X_.iloc[valid_idx]
            ytrain,yvalid = y_.iloc[train_idx],y_.iloc[valid_idx]
            
            model = XGBClassifier(eval_metric = 'logloss').fit(xtrain.values,ytrain.values)
            preds = model.predict(xvalid.values)
            
            acc_score= accuracy_score(yvalid,preds)
            auc_score = roc_auc_score(yvalid,preds)
            f1 = f1_score(yvalid,preds)
            fold_acc.append(acc_score)
            fold_auc.append(auc_score)
            fold_f1.append(f1)
            

    feat_acc.append(round(np.mean(fold_acc),2))  
    feat_auc.append(round(np.mean(fold_auc),2))
    feat_f1.append(round(np.mean(fold_f1),2))
In [ ]:
fig,ax = plt.subplots(1,2,figsize =(12,6))

## accuracy vs number of features from permutation importance based feature selection
ax[0].plot(np.arange(0,len(feat_acc),1),feat_acc, color = colors[0], linewidth = 2)
ax[0].scatter(x =np.arange(0,len(feat_acc),1),y=feat_acc, 
              color = colors[1], s = 75,zorder = 3,
              linewidth = 1,ec = 'black')
ax[0].set_ylabel('Cross-Validation Accuracy Mean',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)
ax[0].set_xlabel('Number of Features',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)

## area under curve vs number of features from permutation importance based feature selection
ax[1].plot(np.arange(0,len(feat_auc),1),feat_auc, color=colors[0], linewidth = 2)
ax[1].scatter(x =np.arange(0,len(feat_auc),1),y=feat_auc,
              color= colors[2], s= 75,zorder =3,
               linewidth = 1,ec = 'black')
ax[1].set_ylabel('Cross-Validation AUC Mean',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)
ax[1].set_xlabel('Number of Features',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)

### title and annotations
## titles and text
fig.text(-0.05,1.18,' Influence of Number of Features on Metric', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
fig.text(-0.05,1.07,'''This plot shows clearly that, even single feature from feature selection is giving
0.92 accuracy, and with increase in number of features accuracy and auc increased. 
But not after 10 to 15 featrues theres in nothing much of change.''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.95)

fig.text(0.35,0.99, "Accuracy",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[1]})
fig.text(0.50,0.99, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.55,0.99, "Area Under Curve",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})


fig.text(0.70,-0.01,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)

fig.tight_layout(pad = 2.5, w_pad = 2.5)

fig.show()
In [ ]:
temp_df = temp_X_df.iloc[:,0:14]
temp_df['all_other'] = temp_X_df.iloc[:,14:len(perm_imp_feats_auc)].sum(axis = 1)
cols = temp_df.columns
temp_xtrain,temp_xtest, temp_ytrain,temp_ytest = train_test_split(temp_df, temp_y_df, test_size = 0.2)

temp_model = XGBClassifier(eval_metric = 'logloss')
temp_model.fit(temp_xtrain,temp_ytrain)

### shapvalues 
explainer = shap.TreeExplainer(temp_model)

shap_values = explainer.shap_values(temp_xtest)

cmap = mpl.colors.LinearSegmentedColormap.from_list("",[colors[1],colors[2],colors[3]])
shap.summary_plot(shap_values,temp_xtest,
                  show = False,cmap = cmap)

# plot settings
## titles and text
plt.gcf().text(-0.1,1.1,' SHAP Values and Features', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
plt.gcf().text(-0.1,0.98,'''This visualizaiton enables us to understand the feature importance
and global interpretation. ...''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.95)


plt.gcf().text(0.65,-0.01,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)



plt.gcf().show()
In [ ]:
### helper function
def plot_feat(axes_idx = None, data_ = None, scaler_method = None, color = None):
    
    col_names = data_.columns
  
    scaled_array = scaler_method.fit_transform( data_)
    
    # scaled dataframe
    scaled_df = pd.DataFrame(scaled_array, columns=col_names)
     
    skew_scaler = []
    for idx, col in zip(axes_idx, col_names):
        col_skew = skew(scaled_df[col])
        if col_skew > 1.5: 
            temp = np.log1p(scaled_df[col] + 0.5)
        else:
            temp = scaled_df[col]
       
        ## plot
        sns.kdeplot(x = temp, ax = axes[idx],
                   color = color,fill = True, alpha = 1,
                   linewidth = 3,ec = 'black')
        
        
        skew_scaler.append(col_skew)
        
        xlabel = ' '.join([value.capitalize() for value in str(col).split('_') ])
        #ax.set_facecolor(colors[-1])
        axes[idx].axes.get_yaxis().set_visible(False)
        axes[idx].axes.set_xlabel(xlabel,{'fontfamily':'serif','size':10, 'weight':'bold'}, alpha = 1)
        
    return skew_scaler
        
### Scalers ans axis indicies

scaler_list = [StandardScaler(), MinMaxScaler(), RobustScaler()]
axes_np_list = [np.arange(0,30,3).tolist(), np.arange(1,30,3).tolist(), np.arange(2,30,3).tolist()]
colors_list = [colors[0],colors[1],colors[2]]
data = temp_X_df.iloc[:,0:14]
    
## plotting 
fig,ax = plt.subplots(15,3, figsize = (10,20))
axes = ax.ravel()

scaler_skews = []
for axes_idx_list,scaler, color in zip(axes_np_list,scaler_list, colors_list):
    
    skewness = plot_feat(axes_idx = axes_idx_list, data_ = data, scaler_method = scaler,color=color)
    scaler_skews.append(skewness)
    
    
plt.tight_layout()


## titles and text
fig.text(0,1.045,' Influence of Scaling on Data', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(0,1.02,'''Three Common approches for Data Scaling are explored here...As all the
#outliers are removed and we couldnt expect much of change... ''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 1)

fig.text(0.22,1.005, "Standardization\nStandardScaler",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.44,1.01, '|',{'fontfamily':'serif','size':27, 'weight':'bold'})
fig.text(0.50,1.005, "Normalization\nMinMaxScaler",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[1]})
fig.text(0.72,1.01, '|',{'fontfamily':'serif','size':27, 'weight':'bold'})
fig.text(0.76,1.005, "OutlierRemoval\nRobustScaler",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})


fig.text(0.73,0,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)

fig.show()
In [ ]:
print(color_class.BOLD_COLOR + '\nFinal Data Scaling with StandardScaler.....\n'+color_class.END)

## final data
xdata= df.drop(columns = ['diagnosis'])
xdata = xdata[perm_imp_feats_auc].iloc[:,0:15]
ydata = df['diagnosis']

## final data shapes
print(color_class.BOLD + '\nShape of features Data: '+color_class.END+\
      color_class.BOLD_COLOR+ str(xdata.shape) + color_class.END)
print(color_class.BOLD + 'Shape of target Data: ' + color_class.END+\
      color_class.BOLD_COLOR+ str(ydata.shape) + color_class.END)
print(color_class.BOLD_COLOR+ '\nAll set for final modeling...\n' + color_class.END)

Final Data Scaling with StandardScaler.....


Shape of features Data: (540, 15)
Shape of target Data: (540,)

All set for final modeling...

In [ ]:
classifiers = []
classifiers.append(LogisticRegression(random_state = 2021))
classifiers.append(SVC(random_state=2021, probability = True))
classifiers.append(KNeighborsClassifier())
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=2021)))
classifiers.append(RandomForestClassifier(random_state=2021))
classifiers.append(GradientBoostingClassifier(random_state=2021))
classifiers.append(ExtraTreesClassifier(random_state= 2021))
classifiers.append(XGBClassifier(random_state = 2021,eval_metric = 'logloss'))
classifiers.append(LGBMClassifier(random_state = 2021))
In [ ]:
stratified = StratifiedKFold(n_splits = 5, shuffle = True, random_state  = 20)
Algorithms = ["Logistic","SVC","KNeighbors","AdaBoost",
              "RandomForest","GradientBoosting",
              "ExtraTrees","XGBoost", "LightGBM"]

## empty lists to stores values and states
class_accuracy = []
class_f1 = []
class_auc = []
class_preds = []
class_class_states= []
class_valid_truths = []
class_valid_features = []
class_cm = []

for classifier,algo in zip(classifiers,Algorithms):
    
    fold_accuracy = []
    fold_f1 = []
    fold_roc_auc = []
    fold_preds = []
    fold_class_states = []
    fold_valid_truths = []
    fold_valid_features = []
    fold_cm = []
   
    n = 0
    
    print(color_class.BOLD + '*'*17+  color_class.END + color_class.BOLD_COLOR + str(algo) + color_class.END + color_class.BOLD +  '*'*17 + color_class.END)
    for train_idx,valid_idx in stratified.split(xdata,ydata):
        xtrain,xvalid = xdata.iloc[train_idx],xdata.iloc[valid_idx]
        ytrain,yvalid = ydata.iloc[train_idx],ydata.iloc[valid_idx]
    
        ## scaling 
        ss = StandardScaler()
        xtrain = ss.fit_transform(xtrain)
        xvalid = ss.transform(xvalid)
    
        
        # model 
        model = classifier
        model.fit(xtrain,ytrain)
        preds = model.predict(xvalid)
        
        ## scores
        #### fold results, feaures,preds, states
        accuracy = accuracy_score(yvalid, preds)
        f1 = f1_score(yvalid,preds)
        roc_auc = roc_auc_score(yvalid,preds)
        cm = confusion_matrix(yvalid,preds)
        
        fold_accuracy.append(accuracy)
        fold_f1.append(f1)
        fold_roc_auc.append(roc_auc)
        fold_preds.append(preds)
        fold_class_states.append(model)
        fold_valid_truths.append(np.array(yvalid).astype(int))
        fold_valid_features.append(xvalid)
        fold_cm.append(cm)
        
        
        ## printing results 
        print(color_class.BOLD)
        print("fold{}: Accuracy: {}, F1:{}, Roc_Auc: {} ".format(n, round(accuracy,2),round(f1,2),round(roc_auc,2)))
        print(color_class.END)
        
        n+=1
        
    #### class results, feaures,preds, states
    class_accuracy.append(fold_accuracy)
    class_f1.append(fold_f1)
    class_auc.append(fold_auc)
    class_preds.append(fold_preds)    
    class_valid_truths.append(fold_valid_truths)
    class_valid_features.append(fold_valid_features)
    class_cm.append(fold_cm)
    class_class_states.append(fold_class_states)
    
    ## breif result dynamic prints

    print( color_class.BOLD+ '\n'+'*'*10 +'Means'+ '*'*10+'\n' + color_class.END)
    print(color_class.BOLD_COLOR)
    print('Accuracy Mean: {}'.format(round(np.mean(fold_accuracy),2)))
    print('F1 Mean: {}'.format(round(np.mean(fold_f1),2)))
    print('ROC_AUC Mean: {}'.format(round(np.mean(fold_roc_auc),2)))
    print(color_class.END)
    print('\n'+  color_class.BOLD+'*'*30 + color_class.END +'\n')
*****************Logistic*****************

fold0: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.95 


fold1: Accuracy: 0.99, F1:0.99, Roc_Auc: 0.99 


fold2: Accuracy: 0.99, F1:0.99, Roc_Auc: 0.99 


fold3: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 


fold4: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.96 


**********Means**********


Accuracy Mean: 0.98
F1 Mean: 0.96
ROC_AUC Mean: 0.97


******************************

*****************SVC*****************

fold0: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 


fold1: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 


fold2: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.99 


fold3: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 


fold4: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.95 


**********Means**********


Accuracy Mean: 0.97
F1 Mean: 0.96
ROC_AUC Mean: 0.97


******************************

*****************KNeighbors*****************

fold0: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 


fold1: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.94 


fold2: Accuracy: 0.99, F1:0.99, Roc_Auc: 0.99 


fold3: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 


fold4: Accuracy: 0.94, F1:0.92, Roc_Auc: 0.94 


**********Means**********


Accuracy Mean: 0.97
F1 Mean: 0.95
ROC_AUC Mean: 0.96


******************************

*****************AdaBoost*****************

fold0: Accuracy: 0.92, F1:0.88, Roc_Auc: 0.9 


fold1: Accuracy: 0.94, F1:0.91, Roc_Auc: 0.93 


fold2: Accuracy: 0.92, F1:0.89, Roc_Auc: 0.94 


fold3: Accuracy: 0.92, F1:0.88, Roc_Auc: 0.91 


fold4: Accuracy: 0.95, F1:0.94, Roc_Auc: 0.96 


**********Means**********


Accuracy Mean: 0.93
F1 Mean: 0.9
ROC_AUC Mean: 0.93


******************************

*****************RandomForest*****************

fold0: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 


fold1: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 


fold2: Accuracy: 0.95, F1:0.94, Roc_Auc: 0.96 


fold3: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.94 


fold4: Accuracy: 0.94, F1:0.92, Roc_Auc: 0.94 


**********Means**********


Accuracy Mean: 0.96
F1 Mean: 0.94
ROC_AUC Mean: 0.95


******************************

*****************GradientBoosting*****************

fold0: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 


fold1: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 


fold2: Accuracy: 0.94, F1:0.93, Roc_Auc: 0.96 


fold3: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 


fold4: Accuracy: 0.94, F1:0.92, Roc_Auc: 0.94 


**********Means**********


Accuracy Mean: 0.96
F1 Mean: 0.94
ROC_AUC Mean: 0.95


******************************

*****************ExtraTrees*****************

fold0: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.97 


fold1: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.93 


fold2: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.98 


fold3: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.97 


fold4: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.95 


**********Means**********


Accuracy Mean: 0.96
F1 Mean: 0.95
ROC_AUC Mean: 0.96


******************************

*****************XGBoost*****************

fold0: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.95 


fold1: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 


fold2: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.97 


fold3: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 


fold4: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.95 


**********Means**********


Accuracy Mean: 0.96
F1 Mean: 0.94
ROC_AUC Mean: 0.95


******************************

*****************LightGBM*****************

fold0: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 


fold1: Accuracy: 0.94, F1:0.9, Roc_Auc: 0.91 


fold2: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.97 


fold3: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 


fold4: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.97 


**********Means**********


Accuracy Mean: 0.96
F1 Mean: 0.94
ROC_AUC Mean: 0.95


******************************

In [ ]:
print(color_class.BOLD_COLOR+ 'Storing results into dataframe....\n'+ color_class.END)

results_df = (pd.DataFrame({'Algorithms': Algorithms,
                            'Mean Accuracy':np.row_stack(class_accuracy).mean(axis = 1), 
                            'Mean F1':np.row_stack(class_f1).mean(axis = 1),
                            'Mean Roc_Auc':np.row_stack(class_accuracy).mean(axis = 1),
                            'Classifier Preds':class_preds,
                            'Classifier Valid Truths':class_valid_truths, 
                            'Classifier Valid Features':class_valid_features,
                            'Classifier CM':class_cm,
                            'Classifier States':class_class_states})
              .sort_values(by = 'Mean F1',ascending =True)
              .reset_index(drop = True))

results_df['Preds_array'] = results_df['Classifier Preds'].apply(lambda x: np.array(x).ravel())
results_df['Truths_array'] = results_df['Classifier Valid Truths'].apply(lambda x: np.array(x).ravel())

print(color_class.BOLD)
print(results_df.head(1).T)
Storing results into dataframe....


                                                                           0
Algorithms                                                          AdaBoost
Mean Accuracy                                                       0.927778
Mean F1                                                             0.898997
Mean Roc_Auc                                                        0.927778
Classifier Preds           [[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0,...
Classifier Valid Truths    [[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,...
Classifier Valid Features  [[[0.345677598557644, 0.10977870176279673, 0.0...
Classifier CM              [[[67, 3], [6, 32]], [[66, 4], [3, 35]], [[61,...
Classifier States          [(DecisionTreeClassifier(random_state=96986675...
Preds_array                [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
Truths_array               [1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, ...
In [ ]:
fig, ax = plt.subplots(1,2,figsize = (18,10))

axes = ax.ravel()

axes[0].invert_xaxis()
color_list = results_df['Mean Accuracy'].apply(lambda x: colors[2])
axes[0].barh(y = results_df['Algorithms'], width = round(results_df['Mean Accuracy'],3), height = 0.5, color = color_list)
for pa in ax[0].patches:
    ax[0].text(pa.get_width(),pa.get_y()+pa.get_height()/2, pa.get_width(), ha = 'right', va = 'center',
              **{'fontfamily':'serif','size':10,'weight':'bold'})


color_list1 = results_df['Mean F1'].apply(lambda x: colors[0] )
axes[1].barh(y = results_df['Algorithms'], width = round(results_df['Mean F1'],3), height = 0.5, color = color_list1)
for pa in ax[1].patches:
    ax[1].text(pa.get_width(),pa.get_y()+pa.get_height()/2, pa.get_width(), ha = 'left',va = 'center',
              **{'fontfamily':'serif','size':10,'weight':'bold'})


axes[0].set_yticklabels('')
axes[1].set_yticklabels(results_df['Algorithms'], {'fontfamily':'serif','size':12,'weight':'bold'},rotation = 0,ha= 'center')
axes[1].tick_params(axis = 'y',pad = 75)
axes[0].set_xticklabels('')
axes[1].set_xticklabels('')


## titles and text
fig.text(0,0.945,' Crossvalidation Fold Means and Classifiers', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
fig.text(0,0.89,'''It seems both Logitsticregression,and SVC classifiers are doing best job. Even F1 score is 
good for the given models. Adaboost and Decision Tree are kind of over fitted data.''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.9)

fig.text(0.15,0.825, "Fold Accuracy Mean",{'fontfamily':'serif','size':18, 'weight':'bold', 'color':'black'})
fig.text(0.5,0.825, '|',{'fontfamily':'serif','size':24, 'weight':'bold'})
fig.text(0.65,0.825, "Fold F1 Score Mean",{'fontfamily':'serif','size':18, 'weight':'bold','color':'black'})


fig.text(0.72,0.15,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)





fig.tight_layout(pad = 10,w_pad = 1, h_pad = 10)
fig.show()
In [ ]:
my_cmap = mpl.colors.LinearSegmentedColormap.from_list("",[colors[0],colors[1],colors[2]])



fig,ax = plt.subplots(3,3, figsize=(10,10))

for truth,pred,axes,algo in zip(results_df['Truths_array'],
                                results_df['Preds_array'],
                                ax.ravel(),results_df['Algorithms']):

    
    cf_mat = confusion_matrix(truth,pred)
            
    #### annotations
    labels = ['True Neg','False Pos','False Neg','True Pos']
    counts = ["{0:0.0f}".format(value) for value in cf_mat.flatten()]
    percentages = ["{0:.2%}".format(value) for value in cf_mat.flatten()/np.sum(cf_mat)]
            
    #### final annotations
    label = (np.array([f'{v1}\n{v2}\n{v3}' for v1,v2,v3 in zip(labels,counts,percentages)])).reshape(2,2)
            
    #### scores
    f1 = f1_score(truth,pred)
    auc = roc_auc_score(truth,pred)
    
    #heatmap
    sns.heatmap(data = cf_mat, vmin = 0, vmax =330, cmap = my_cmap,linewidth=2,linecolor = 'white',square = True,
    ax = axes, annot = label, fmt ='', cbar = False, annot_kws = {'fontfamily':'serif','size':10, 'color':'black','weight':'bold','alpha':0.8}, alpha =1)
        
    axes.text(0,-0,'{}'.format(algo),{'fontfamily':'serif','size':12, 'color':'black', 'weight':'bold'})
    
    axes.scatter( 1 , 1 , s = 3500, c = 'white')
    axes.text(0.72,1.1, ' F1: {}\nAUC: {}'.format(round(f1,2), round(auc,2)),{'fontfamily':'serif','size':10, 'color':'black', 'weight':'bold'})
    
    ## ticks and labels
    axes.set_xticklabels('')
    axes.set_yticklabels('')
    
    
    
## titles and text
fig.text(0,1.05,' Crossvalidataion Results', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
fig.text(0,1,'''This Visualization show the results of various classifiers and there respective
results.''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.98)



fig.text(0.72,0.,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)


fig.tight_layout(pad = 2.5, w_pad = 2.5,h_pad = 2.5)
fig.show()
In [ ]:
print(color_class.BOLD + 'Hyperparameters tunning grid...'+ color_class.END)
classifiers_params = {
    
    LogisticRegression(): {'C':[0.001,0.01,0.05,0.1,0.5,1,10,100,200,1000] , 
                  'penalty': ['l1','l2']} ,
    
    SVC(): {'C':[0.001,0.01,0.05,0.1,0.5,1,10,100,200,1000] } ,
    

    LGBMClassifier():     {
                  'class_weight': [{1:6,0:4},{1:7,0:3},{1:8,0:4}],
                  'n_estimators': np.arange(100,3000,250),
                  'num_leaves': np.arange(10,50,10),
                  'learning_rate': [0.01,0.05,0.1,0.5]},
                                         
    RandomForestClassifier() :     {
                   'class_weight': [{1:6,0:4}],
                   'max_depth': [2,4,6,8,10],
                   'max_leaf_nodes': [5,10,15],
                   'n_estimators': np.arange(100,2000,500)} ,
    AdaBoostClassifier() :    {
                   'base_estimator': [DecisionTreeClassifier()],
                   'learning_rate': [0.01,0.05,0.1],
                   'n_estimators': np.arange(100,1000,500)} ,
}
Hyperparameters tunning grid...
In [ ]:
print(color_class.BOLD_COLOR+ 'Gridsearch CV implementation with predefined grid params'+ color_class.END +'\n')
xtrain,xtest,ytrain,ytest = train_test_split(xdata,ydata,random_state = 2021,shuffle = True,stratify= ydata)


ss = StandardScaler()

xtrain = ss.fit_transform(xtrain)
xtest = ss.transform(xtest)

best_est = []
best_pms = []

for clf,params in classifiers_params.items():
    print(color_class.BOLD_COLOR + '*'*20 + color_class.END +'\n')
    
    gs = GridSearchCV(estimator= clf, param_grid = params,cv = stratified, verbose= 2,scoring = 'roc_auc',n_jobs = -1)
    #gs = clf
    gs.fit(xtrain,ytrain)
    best_estimator = gs.best_estimator_
    best_params = gs.best_params_
    best_est.append(best_estimator)
    best_pms.append(best_params)
    preds = best_estimator.predict(xtest)
    
    print(color_class.BOLD)
    print(best_estimator)

    print('Accuracy: {}'.format(accuracy_score(ytest,preds)))
    print('Roc_Auc: {}'.format(round(roc_auc_score(ytest,preds),8)))
    
Gridsearch CV implementation with predefined grid params

********************

Fitting 5 folds for each of 20 candidates, totalling 100 fits

LogisticRegression(C=1)
Accuracy: 0.9629629629629629
Roc_Auc: 0.95176499
********************

Fitting 5 folds for each of 10 candidates, totalling 50 fits

SVC(C=1)
Accuracy: 0.9629629629629629
Roc_Auc: 0.95176499
********************

Fitting 5 folds for each of 576 candidates, totalling 2880 fits

LGBMClassifier(class_weight={0: 4, 1: 8}, learning_rate=0.01, n_estimators=2600,
               num_leaves=20)
Accuracy: 0.9629629629629629
Roc_Auc: 0.94680851
********************

Fitting 5 folds for each of 60 candidates, totalling 300 fits

RandomForestClassifier(class_weight={0: 4, 1: 6}, max_depth=8,
                       max_leaf_nodes=15, n_estimators=600)
Accuracy: 0.9629629629629629
Roc_Auc: 0.94680851
********************

Fitting 5 folds for each of 6 candidates, totalling 30 fits

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), learning_rate=0.05,
                   n_estimators=100)
Accuracy: 0.9185185185185185
Roc_Auc: 0.90280464
In [ ]:
print(color_class.BOLD_COLOR+ 'Results appending from gridsearch...'+'\n'+ color_class.END)
acc_list = []
cm_list = []
f1_list = []
auc_list = []
for clf in best_est:
    preds = clf.predict(xtest)
    cm = confusion_matrix(ytest,preds)
    acc = accuracy_score(ytest,preds)
    f1 = f1_score(ytest,preds)
    auc = roc_auc_score(ytest,preds)
    cm_list.append(cm)
    acc_list.append(acc)
    f1_list.append(f1)
    auc_list.append(auc)
hyper_results_df = pd.DataFrame({'Algorithms':['LogisticRegression', 'SVC','LGBMClassifier','RandomForestClassifier','AdaBoostClassifier'],
                                 'Accuracy':acc_list,
                                 'f1_score':f1_list,
                                  'roc_auc_score':auc_list,
                                  'confusion_matrix':cm_list})
print(color_class.BOLD + '\n')
print(hyper_results_df.head())
Results appending from gridsearch...



               Algorithms  Accuracy  f1_score  roc_auc_score  \
0      LogisticRegression  0.962963  0.945055       0.951765   
1                     SVC  0.962963  0.945055       0.951765   
2          LGBMClassifier  0.962963  0.943820       0.946809   
3  RandomForestClassifier  0.962963  0.943820       0.946809   
4      AdaBoostClassifier  0.918519  0.879121       0.902805   

     confusion_matrix  
0  [[87, 1], [4, 43]]  
1  [[87, 1], [4, 43]]  
2  [[88, 0], [5, 42]]  
3  [[88, 0], [5, 42]]  
4  [[84, 4], [7, 40]]  
In [ ]:
my_cmap = mpl.colors.LinearSegmentedColormap.from_list("",[colors[0],colors[1],colors[2]])



fig,ax = plt.subplots(2,2, figsize=(8,8))

for algo,f1,auc,cm,axes in zip(hyper_results_df['Algorithms'],
                                hyper_results_df['f1_score'],
                                hyper_results_df['roc_auc_score'],
                                hyper_results_df['confusion_matrix'],
                                ax.ravel()):

    
    cf_mat = cm
            
    #### annotations
    labels = ['True Neg','False Pos','False Neg','True Pos']
    counts = ["{0:0.0f}".format(value) for value in cf_mat.flatten()]
    percentages = ["{0:.2%}".format(value) for value in cf_mat.flatten()/np.sum(cf_mat)]
            
    #### final annotations
    label = (np.array([f'{v1}\n{v2}\n{v3}' for v1,v2,v3 in zip(labels,counts,percentages)])).reshape(2,2)
            
    #### scores
    
    #heatmap
    sns.heatmap(data = cf_mat, vmin = 0, vmax =84, cmap = my_cmap,linewidth=2,linecolor = 'white',square = True,
    ax = axes, annot = label, fmt ='', cbar = False, annot_kws = {'fontfamily':'serif','size':10, 'color':'black','weight':'bold','alpha':0.8}, alpha =1)
        
    axes.text(0,-0,'{}'.format(algo),{'fontfamily':'serif','size':12, 'color':'black', 'weight':'bold'})
    
    axes.scatter( 1 , 1 , s = 3500, c = 'white')
    axes.text(0.72,1.1, ' F1: {}\nAUC: {}'.format(round(f1,2), round(auc,2)),{'fontfamily':'serif','size':10, 'color':'black', 'weight':'bold'})
    
    ## ticks and labels
    axes.set_xticklabels('')
    axes.set_yticklabels('')
    
    
    
## titles and text
fig.text(0,1.05,' GridSearch Results', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
fig.text(0,1,'''This Visualization show the results of various classifiers and there respective
results.''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.98)



fig.text(0.72,0.,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)


fig.tight_layout(pad = 2.5, w_pad = 2.5,h_pad = 2.5)
fig.show()
In [ ]:
models = best_est

stack_train,stack_test = stacking(models = best_est,
                                  X_train = xtrain,
                                  y_train = ytrain,
                                  X_test = xtest,
                                  regression = False, 
                                  metric = 'roc_auc',
                                  n_folds = 5,shuffle = True,
                                  stratified = True)

fin_model = XGBClassifier(eval_metric='logloss')

fin_model.fit(stack_train, ytrain)
fin_preds = fin_model.predict(stack_test)
print(color_class.BOLD_COLOR+'Stacked Classification...'+'\n'+ color_class.END)
print(color_class.BOLD)
print('accuracy: {}'.format(round(accuracy_score(ytest,fin_preds),3)))
print('roc_auc: {}'.format(round(roc_auc_score(ytest,fin_preds),3)))
print('f1:{}'.format(round(f1_score(ytest,fin_preds),3)))
Stacked Classification...


accuracy: 0.963
roc_auc: 0.952
f1:0.945
In [ ]:
models 
Out[ ]:
[LogisticRegression(C=1),
 SVC(C=1),
 LGBMClassifier(class_weight={0: 4, 1: 8}, learning_rate=0.01, n_estimators=2600,
                num_leaves=20),
 RandomForestClassifier(class_weight={0: 4, 1: 6}, max_depth=8,
                        max_leaf_nodes=15, n_estimators=600),
 AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), learning_rate=0.05,
                    n_estimators=100)]
In [ ]:
# my_cmap = mpl.colors.LinearSegmentedColormap.from_list("",[colors[0],colors[1],colors[2]])



fig,ax = plt.subplots(figsize=(8,8))

cf_mat = confusion_matrix(ytest,fin_preds)
f1 = f1_score(ytest,fin_preds)
auc = roc_auc_score(ytest,fin_preds)
            
#### annotations
labels = ['True Neg','False Pos','False Neg','True Pos']
counts = ["{0:0.0f}".format(value) for value in cf_mat.flatten()]
percentages = ["{0:.2%}".format(value) for value in cf_mat.flatten()/np.sum(cf_mat)]
            
#### final annotations
label = (np.array([f'{v1}\n{v2}\n{v3}' for v1,v2,v3 in zip(labels,counts,percentages)])).reshape(2,2)
            
#### scores
    
#heatmap
sns.heatmap(data = cf_mat, vmin = 0, vmax =84, cmap = my_cmap,linewidth=2,linecolor = 'white',square = True,
    ax = ax, annot = label, fmt ='', cbar = False, annot_kws = {'fontfamily':'serif','size':12, 'color':'black','weight':'bold','alpha':0.8}, alpha =1)
   # ax = ax, annot = label, fmt ='', cbar = False, annot_kws = {'fontfamily':'serif','size':12, 'color':'black','weight':'bold','alpha':0.8}, alpha =1)
        
ax.text(0,-0,'{}'.format('Stacked Classification'),{'fontfamily':'serif','size':12, 'color':'black', 'weight':'bold'})
    
ax.scatter( 1 , 1 , s = 5000, c = 'white')
ax.text(0.85,1.05, ' F1: {}\nAUC: {}'.format(round(f1,2), round(auc,2)),{'fontfamily':'serif','size':12, 'color':'black', 'weight':'bold'})
    
## ticks and labels
ax.set_xticklabels('')
ax.set_yticklabels('')
    
    
    
## titles and text
fig.text(0,1.05,' Stacked Classification Results', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
fig.text(0,1,'''This Visualization show the results of stacked classification and there respective
results.''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.98)

fig.text(0.72,0.,'© Made by Milon',{'fontfamily':'serif', 'size':12,'weight':'bold'}, alpha = 0.85)


fig.tight_layout(pad = 2.5, w_pad = 2.5,h_pad = 2.5)
fig.show()